From 88e6fa7fdb1fbaf576237d95744c901c3699030b Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 18 Oct 2024 08:25:54 -0700
Subject: [PATCH 001/153] add the lsr-drop-solution=1 compiler flag (#1582)

---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfcfa24b3..0700fe838 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
     add_compile_options(-fno-offload-uniform-block)
   endif()
 endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
+  check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
+  if(HAS_LSR_DROP_SOLUTION)
+    message("Adding the lsr-drop-solution=1 compiler flag")
+    add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
+  endif()
+endif()
 if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
   check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
   if(HAS_ENABLE_POST_MISCHED)
-- 
GitLab


From a285d6f9b5c8ada9f306fae9724d6788060e7e2a Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Fri, 18 Oct 2024 23:46:11 +0800
Subject: [PATCH 002/153] disable bad instance detected on MI308CPX (#1584)

---
 .../device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp
index 5cebad491..5c525244e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn.hpp
@@ -46,7 +46,7 @@ using device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   192,   256,    64,    16,   8,  32,   32,    3,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,    16,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<8,  32, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,        
+        // DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<8,  32, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,        
         // We prefer following instance, however, existing compiler bug cause it failed to generate sanity code.
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-- 
GitLab


From 95e722a3b357334fe05b0a7f217b60c591592967 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Mon, 21 Oct 2024 10:52:11 +0800
Subject: [PATCH 003/153] [CK_TILE] Optimize fmha splitkv & splitkv combine
 kernels (#1577)

* Use smaller width for lse_accum dist tensor

* Update pipeline comment

* Fix wrong distribution for lse_accum

* Remove duplicate dim in lse_accum dist encoding

* Decide fmha splitkv combine kernel kBlockSize by kM0

* Remove assumption of MPerThread=1

* Add log<4> & log<8> specialization

* Enlarge occupancy array

* Fix vector size for small tile

* Add support for kMaxSplits=8

* Re-format gemm.hpp

* Use 16x16x16 warp gemm for fwd_splitkv

* Centralize policy code changes

* Leave fp8/bf8 tile settings unchanged
---
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   | 30 +++++----
 ...lock_fmha_fwd_splitkv_combine_pipeline.hpp | 27 +++++---
 ...plitkv_combine_pipeline_default_policy.hpp | 67 +++++++++++++------
 .../pipeline/block_fmha_pipeline_problem.hpp  |  3 +-
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 15 ++++-
 include/ck_tile/ops/gemm.hpp                  |  2 +-
 6 files changed, 96 insertions(+), 48 deletions(-)

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 82cf3a5ab..57360ea99 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -191,7 +191,9 @@ using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_m
 template<>
 void fmha_fwd_splitkv_combine_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
-    if (a.num_splits <= 16) {{
+    if (a.num_splits <= 8) {{
+        kernel_runner<3>::run(s, a);
+    }} else if (a.num_splits <= 16) {{
         kernel_runner<4>::run(s, a);
     }} else if (a.num_splits <= 32) {{
         kernel_runner<5>::run(s, a);
@@ -239,7 +241,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
                 using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
-                using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
+                using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
 
                 return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
             }}
@@ -551,14 +553,14 @@ class FmhaFwdSplitKVCombineKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-                '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1, 32, 32, 16, -1),
-                '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1, 32, 32, 16, -1),
-                '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1, 32, 32, 16, -1),
-                '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1, 32, 32, 16, -1),
+            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32, 32,   2, 1, 1, 16, 16, 16, -1),
+            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32, 64,   4, 1, 1, 16, 16, 16, -1),
+            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128,  4, 1, 1, 16, 16, 16, -1),
+            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256,  4, 1, 1, 16, 16, 16, -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     2, 1, 1, 32, 32, 32, -1),
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32, 64,   2, 1, 1, 32, 32, 32, -1),
             '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1, 32, 32, 32, -1),
             '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1, 32, 32, 32, -1)
         }
@@ -568,16 +570,16 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
 def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-                '32'  : FmhaFwdSplitKVCombineTileSize(64, 32, -1),
-                '64'  : FmhaFwdSplitKVCombineTileSize(64, 64, -1),
-                '128' : FmhaFwdSplitKVCombineTileSize(64, 128, -1),
-                '256' : FmhaFwdSplitKVCombineTileSize(64, 256, -1),
+            '32'  : FmhaFwdSplitKVCombineTileSize(16, 16,  -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32, 32,  -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
     }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-                '64'  : FmhaFwdSplitKVCombineTileSize(64, 64, -1),
-                '128' : FmhaFwdSplitKVCombineTileSize(64, 128, -1),
-                '256' : FmhaFwdSplitKVCombineTileSize(64, 256, -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(64, 32,  -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(64, 64,  -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(64, 128, -1),
         }
     else:
         return None
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
index 1afe0feab..7c49fce99 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
@@ -12,6 +12,16 @@ namespace detail {
 template <index_t N>
 struct log2;
 
+template <>
+struct log2<4> : std::integral_constant<index_t, 2>
+{
+};
+
+template <>
+struct log2<8> : std::integral_constant<index_t, 3>
+{
+};
+
 template <>
 struct log2<16> : std::integral_constant<index_t, 4>
 {
@@ -72,18 +82,18 @@ struct BlockFmhaFwdSplitKVCombinePipeline
         {
             if constexpr(kHeadDimV <= 32)
             {
-                constexpr std::array<int, 4> occupancy{3, 3, 3, 1};
-                return occupancy[detail::log2<kMaxSplits>::value - 4];
+                constexpr std::array occupancy{3, 3, 3, 3, 3, 1};
+                return occupancy[detail::log2<kMaxSplits>::value - 2];
             }
             else if constexpr(kHeadDimV <= 128)
             {
-                constexpr std::array<int, 4> occupancy{3, 3, 2, 1};
-                return occupancy[detail::log2<kMaxSplits>::value - 4];
+                constexpr std::array occupancy{3, 3, 3, 3, 2, 1};
+                return occupancy[detail::log2<kMaxSplits>::value - 2];
             }
             else if constexpr(kHeadDimV <= 256)
             {
-                constexpr std::array<int, 4> occupancy{2, 2, 2, 1};
-                return occupancy[detail::log2<kMaxSplits>::value - 4];
+                constexpr std::array occupancy{2, 2, 2, 2, 2, 1};
+                return occupancy[detail::log2<kMaxSplits>::value - 2];
             }
         }
     }();
@@ -138,9 +148,8 @@ struct BlockFmhaFwdSplitKVCombinePipeline
         auto lse_accum = make_static_distributed_tensor<LSEDataType>(
             Policy::template MakeLSEaccRegTileDistribution<Problem>());
 
-        // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, max(kMaxSplits, warp_size)])
-        // this will extend the distributed tensor width so that each thread in wave have data to
-        // reduce.
+        // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits])
+        // and fill up -INF values outside the [kM0, num_splits] region.
         {
             constexpr auto spans = decltype(lse_accum)::get_distributed_spans();
             sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
index 3327d4af8..ebd69c0cf 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
@@ -10,11 +10,26 @@ namespace ck_tile {
 
 struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
 {
+    template <index_t BlockSize, index_t M, index_t N, typename DataType>
+    CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeForTile()
+    {
+        constexpr index_t PixelsPerThread = (M * N) / BlockSize;
+        static_assert(0 < PixelsPerThread);
+
+        constexpr index_t MaxNPerThread = 16 / sizeof(DataType);
+        constexpr index_t NPerThread    = min(MaxNPerThread, PixelsPerThread);
+
+        return NPerThread;
+    }
+
+    // alignment for dram lse tile (shape=[kMaxSplits, kM0])
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentLSE()
     {
-        using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
-        return 16 / sizeof(LSEDataType);
+        return GetVectorSizeForTile<Problem::kBlockSize,
+                                    Problem::kMaxSplits,
+                                    Problem::kM0,
+                                    typename Problem::LSEDataType>();
     }
 
     template <typename Problem>
@@ -47,29 +62,31 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
                MakeLSEaccLdsBlockDescriptor<Problem>().get_element_space_size();
     }
 
+    // shape=[kMaxSplits, kM0]
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccDramTileDistribution()
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
         constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNumWarps  = Problem::kNumWarps;
 
         constexpr index_t kNPerBlock = Problem::kM0;
         constexpr index_t kMPerBlock = Problem::kMaxSplits;
 
-        constexpr index_t NPerThread = 16 / sizeof(LSEDataType);
-        constexpr index_t NThreads   = kNPerBlock / NPerThread;
+        constexpr index_t NPerThread =
+            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
+        constexpr index_t NThreads = kNPerBlock / NPerThread;
 
         constexpr index_t MThreadsPerWarp = get_warp_size() / NThreads;
-        constexpr index_t TotalWarps      = kBlockSize / get_warp_size();
-        constexpr index_t MPerThread      = kMPerBlock / (TotalWarps * MThreadsPerWarp);
+        constexpr index_t MPerThread      = kMPerBlock / (kNumWarps * MThreadsPerWarp);
 
         static_assert(NThreads * NPerThread == kNPerBlock);
-        static_assert(MPerThread * TotalWarps * MThreadsPerWarp == kMPerBlock);
+        static_assert(MPerThread * kNumWarps * MThreadsPerWarp == kMPerBlock);
 
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<MPerThread, TotalWarps, MThreadsPerWarp>,
+                                       tuple<sequence<MPerThread, kNumWarps, MThreadsPerWarp>,
                                              sequence<NThreads, NPerThread>>,
                                        tuple<sequence<1>, sequence<1, 2>>,
                                        tuple<sequence<1>, sequence<2, 0>>,
@@ -77,15 +94,18 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
                                        sequence<0, 1>>{});
     }
 
-    // 3d + padding, [kMaxSplits, kM0]
+    // 3d + padding, shape=[kMaxSplits, kM0]
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccLdsStoreBlockDescriptor()
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+
         constexpr index_t kMPerBlock = Problem::kMaxSplits;
         constexpr index_t kNPerBlock = Problem::kM0;
-        constexpr index_t NPack      = 16 / sizeof(LSEDataType);
+        constexpr index_t NPack =
+            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
 
         constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kNPerBlock / NPack>{}, number<kMPerBlock>{}, number<NPack>{}),
@@ -103,15 +123,18 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
         return lse_acc_lds_block_desc;
     }
 
-    // 3d + padding, [kM0, kMaxSplits]
+    // 3d + padding, shape=[kM0, kMaxSplits]
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccLdsBlockDescriptor()
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+
         constexpr index_t kMPerBlock = Problem::kMaxSplits;
         constexpr index_t kNPerBlock = Problem::kM0;
-        constexpr index_t NPack      = 16 / sizeof(LSEDataType);
+        constexpr index_t NPack =
+            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
 
         constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kNPerBlock / NPack>{}, number<kMPerBlock>{}, number<NPack>{}),
@@ -134,26 +157,28 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
     {
         constexpr index_t kBlockSize = Problem::kBlockSize;
 
-        constexpr index_t kNPerBlock = max(Problem::kMaxSplits, get_warp_size());
+        constexpr index_t kNPerBlock = Problem::kMaxSplits;
         constexpr index_t kMPerBlock = Problem::kM0;
 
-        constexpr index_t NThreads   = get_warp_size();
+        constexpr index_t NThreads   = 4;
         constexpr index_t NPerThread = kNPerBlock / NThreads;
 
-        constexpr index_t MThreads   = kBlockSize / NThreads;
-        constexpr index_t MPerThread = kMPerBlock / MThreads;
+        constexpr index_t MThreads       = kBlockSize / NThreads;
+        constexpr index_t MPerThread     = kMPerBlock / MThreads;
+        constexpr index_t MWarps         = kBlockSize / get_warp_size();
+        constexpr index_t MThreadPerWarp = get_warp_size() / NThreads;
 
         static_assert(NThreads * NPerThread == kNPerBlock);
-        static_assert(MThreads * MPerThread == kMPerBlock);
+        static_assert(MWarps * MThreadPerWarp * MPerThread == kMPerBlock);
 
         return make_static_tile_distribution(
             tile_distribution_encoding<
                 sequence<1>,
-                tuple<sequence<MThreads, MPerThread>, sequence<NThreads, NPerThread>>,
-                tuple<sequence<1>, sequence<2>>,
-                tuple<sequence<0>, sequence<0>>,
+                tuple<sequence<MWarps, MThreadPerWarp, MPerThread>, sequence<NThreads, NPerThread>>,
+                tuple<sequence<1>, sequence<2, 1>>,
+                tuple<sequence<0>, sequence<0, 1>>,
                 sequence<1, 2>,
-                sequence<1, 1>>{});
+                sequence<2, 1>>{});
     }
 
     template <typename Problem>
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index d254f07e2..1846664e7 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -115,7 +115,8 @@ struct BlockFmhaSplitKVCombinePipelineProblem
     using ODataType    = remove_cvref_t<ODataType_>;
     using Traits       = remove_cvref_t<Traits_>;
 
-    static constexpr index_t kBlockSize = 256;
+    static constexpr index_t kNumWarps  = kM0_ / (get_warp_size() / 4);
+    static constexpr index_t kBlockSize = kNumWarps * get_warp_size();
     static constexpr bool kIsGroupMode  = kIsGroupMode_;
 
     static constexpr index_t kHeadDimV = HeadDimV_;
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 8fa325241..a66d2be78 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -88,22 +88,33 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
                                            typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
 
         constexpr auto warp_gemm = []() {
+            constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
+            static_assert(WarpGemmM == 16 || WarpGemmM == 32);
+
             if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
                          std::is_same_v<typename Problem::KDataType, half_t> &&
                          std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
+                if constexpr(WarpGemmM == 32)
+                    return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
+                else // WarpGemmM == 16
+                    return WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution{};
             }
             else if constexpr(std::is_same_v<typename Problem::QDataType, bf16_t> &&
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
+                if constexpr(WarpGemmM == 32)
+                    return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
+                else // WarpGemmM == 16
+                    return WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution{};
             }
             else if constexpr(std::is_same_v<typename Problem::QDataType, fp8_t> &&
                               std::is_same_v<typename Problem::KDataType, fp8_t> &&
                               std::is_same_v<typename Problem::SaccDataType, float>)
             {
+                static_assert(WarpGemmM == 32);
+
                 // TODO: hard coded here. Otherwise, it may incorrect result
                 constexpr index_t swizzle_factor = 4;
                 return WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution<
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 436d964c3..e70825570 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -23,12 +23,12 @@
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
 #include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
-- 
GitLab


From 560917b1610eded84d4383c6927a2a2b8704b2a4 Mon Sep 17 00:00:00 2001
From: Thomas Ning <Thomas.Ning@amd.com>
Date: Mon, 21 Oct 2024 22:47:48 +0800
Subject: [PATCH 004/153] Ck profiler instance support (#1575)

* The draft on ckProfiler instance add

* support the ck profiler instance with same data types

* add a small feature on the M and N variable switch.

* Partially solve the incorrect result problem

* fix based on ci cd
---
 ..._xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp |  9 +++++++-
 ..._xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp | 17 ++++++++++++--
 ..._xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp |  6 +++++
 ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp |  6 +++++
 ...emm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp | 16 +++++++++++--
 ...emm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp | 23 ++++++++++++++++++-
 profiler/src/profile_gemm_universal.cpp       | 23 +++++++++++++++----
 7 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
index 615711147..3300c4b0f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
@@ -44,8 +44,11 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tu
         
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         // Can we support this kind of odd case? 224(256) = 28*8 + (4*8)
         //DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -64,10 +67,13 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tup
 
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   4,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -75,7 +81,8 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   4,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   4,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   2,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
index 32a7d640d..d7b005118 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
@@ -44,13 +44,21 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu
         
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
@@ -64,18 +72,23 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tup
 
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   8,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   4,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   8,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   8,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   8,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   8,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   2,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
index 2b1e84f0c..9566d5555 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -43,6 +43,8 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tu
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -63,14 +65,18 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tup
 
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index d56771823..72162b65d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -44,6 +44,8 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tu
         
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill
@@ -69,8 +71,12 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   4,   4,  16,   16,    4,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
index d34c83a60..af9494f5a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -41,6 +41,8 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -49,7 +51,9 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
@@ -63,12 +67,19 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
 
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   4,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -82,6 +93,7 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
index ca90efa4c..f9d693f45 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
@@ -42,11 +42,20 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple
         
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   4,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   2,   2,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill when use permuted lds layout. so, use padding for these two.
@@ -70,13 +79,21 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
 
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   4,   4,  32,   32,    2,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   4,   4,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -84,12 +101,16 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   4,   4,  32,   32,    1,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   2,   2,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index a2ef11713..f86dddc72 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -57,6 +57,25 @@ int profile_gemm_universal(int argc, char* argv[])
         exit(1);
     }
 
+    int M;
+    int N;
+    int StrideA;
+    int StrideB;
+    // Analyze the unsupported matrix shapes, switch the M and N number
+    if(std::stoi(argv[9]) % 8 != 0 && std::stoi(argv[8]) % 8 == 0)
+    {
+        M       = std::stoi(argv[9]);
+        StrideA = std::stoi(argv[12]);
+        N       = std::stoi(argv[8]);
+        StrideB = std::stoi(argv[11]);
+    }
+    else
+    {
+        M       = std::stoi(argv[8]);
+        StrideA = std::stoi(argv[11]);
+        N       = std::stoi(argv[9]);
+        StrideB = std::stoi(argv[12]);
+    }
     const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
     const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
     const bool do_verification = std::stoi(argv[4]);
@@ -64,12 +83,8 @@ int profile_gemm_universal(int argc, char* argv[])
     const bool do_log          = std::stoi(argv[6]);
     const bool time_kernel     = std::stoi(argv[7]);
 
-    const int M = std::stoi(argv[8]);
-    const int N = std::stoi(argv[9]);
     const int K = std::stoi(argv[10]);
 
-    const int StrideA = std::stoi(argv[11]);
-    const int StrideB = std::stoi(argv[12]);
     const int StrideC = std::stoi(argv[13]);
     const int KBatch  = std::stoi(argv[14]);
 
-- 
GitLab


From d0565e33d6eb8f0c464080dcbd8f879250ca5067 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 21 Oct 2024 08:34:53 -0700
Subject: [PATCH 005/153] Bump rocm-docs-core from 1.8.2 to 1.8.3 in
 /docs/sphinx (#1587)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.2 to 1.8.3.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.2...v1.8.3)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index fa1897e23..c2220e15d 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 7d0c92d04..0dc2e70c5 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 794f2d64a8a03a1408126332451a7e75f589d4ef Mon Sep 17 00:00:00 2001
From: spolifroni-amd <Sandra.Polifroni@amd.com>
Date: Mon, 21 Oct 2024 11:35:57 -0400
Subject: [PATCH 006/153] added link to documentation (#1578)

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 4366ec032..053406515 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # Composable Kernel
 
+> [!NOTE]
+> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
 The Composable Kernel (CK) library provides a programming model for writing performance-critical
 kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
 uses general purpose kernel languages, such as HIP C++.
-- 
GitLab


From 3f710930f6f570e47025a30286ce12a1a3549bb7 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:45:22 -0500
Subject: [PATCH 007/153] Update default stride (#1576)

* Update default stride value to -1

* Fix format

* Revert "Fix format"

This reverts commit ae0c3649ec48e330bb162cd6a12fd3d2e3bee64a.

---------

Co-authored-by: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com>
---
 example/01_gemm/common.hpp                    | 24 +++++++++----------
 example/01_gemm/run_gemm_example.inc          | 12 +++++-----
 .../01_gemm/run_gemm_example_streamk_v2.inc   |  4 ++--
 example/01_gemm/run_gemm_example_v2.inc       | 12 +++++-----
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index eb1738e76..d08196924 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -29,9 +29,9 @@ struct ProblemSize final
     ck::index_t N = 4096;
     ck::index_t K = 4096;
 
-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;
 };
 
 struct ProblemSizeStreamK final
@@ -40,9 +40,9 @@ struct ProblemSizeStreamK final
     ck::index_t N = 4096;
     ck::index_t K = 4096;
 
-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;
 
     ck::index_t NumSKBlocks = -1;
 };
@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final
     ck::index_t N = 4096;
     ck::index_t K = 4096;
 
-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;
 
     ck::index_t Grid_size   = -1; // defaults to max occupancy
     ck::index_t Streamk_sel = 1;  // defaults to 1-tile SK
@@ -66,9 +66,9 @@ struct ProblemSizeSplitK final
     ck::index_t N = 4096;
     ck::index_t K = 4096;
 
-    ck::index_t StrideA = 0;
-    ck::index_t StrideB = 0;
-    ck::index_t StrideC = 0;
+    ck::index_t StrideA = -1;
+    ck::index_t StrideB = -1;
+    ck::index_t StrideC = -1;
 
     ck::index_t KBatch = 1;
 };
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index f66d2adc1..fe12998e3 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         };
 
     auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(stride == 0)
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
             {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                 if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                 {
-                    return col;
+                    return static_cast<std::size_t>(col);
                 }
                 else
                 {
-                    return row;
+                    return static_cast<std::size_t>(row);
                 }
             }
             else
-                return stride;
+                return static_cast<std::size_t>(stride);
         };
 
     StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc
index 32bd3a19a..6679f9515 100644
--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 
     auto f_get_default_stride =
         [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
             {
-                // give a chance if stride is 0, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                 if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                 {
                     return static_cast<std::size_t>(col);
diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index ad7238f0d..0bcee658b 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         };
 
     auto f_get_default_stride =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(stride == 0)
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
             {
-                // give a chance if stride is zero, return a default packed stride
+                // give a chance if stride is -1, return a default packed stride
                 if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
                 {
-                    return col;
+                    return static_cast<std::size_t>(col);
                 }
                 else
                 {
-                    return row;
+                    return static_cast<std::size_t>(row);
                 }
             }
             else
-                return stride;
+                return static_cast<std::size_t>(stride);
         };
 
     StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
-- 
GitLab


From 0394f8a713d40aae40339691e8ab980823d76a54 Mon Sep 17 00:00:00 2001
From: ltqin <letao.qin@amd.com>
Date: Tue, 22 Oct 2024 09:26:18 +0800
Subject: [PATCH 008/153] update layernorm (#1570)

* port layernorm

* change warp_welford.hpp

* Update warpshuffle

* 1. Add save mean and save std back
2. Move construction of tensor_view and tile_window to operator()

* refine welford max count calculation

* unify layernorm api

* Rename file

* Remove save mean and inv std

* Revert "refine welford max count calculation"

This reverts commit 022365802b43a398deee2bc672785fa31a89297d.

* Fix order of parameter

* refine welford max count calculation again

* Remove fp32 instances

* Fix bug of padding

* refactor api

* Support bf16

* Extract common function

* Refine arg of operator()

* Add kMThreadPerBlock to template parameter

* clang format

* Refine variable name

* Refine file name

* remove redundant line

* refactor layernorm2d pipeline and add block-per-block utility

* fix name

* rename more

* add more block-per-tile instance

* remove duplicated define

* update instance for 2048, 1024 case

* support up to 2048 now

* opt loading

* add n1536

* Add two pass pipeline

* format

* Fix incorrect type

* parallel compilation

* Use smaller N

* fix 2p pass

* Support Repeat_M in distribution

* Refine nameing

* Add reduce example

---------

Co-authored-by: letaoqin <letaoqin@amd.com>
Co-authored-by: aska-0096 <haocwang@amd.com>
Co-authored-by: rocking <ChunYu.Lai@amd.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
---
 ...n_complex_contraction_bilinear_example.inc | 223 ++++----
 example/ck_tile/02_layernorm2d/CMakeLists.txt |  21 +-
 example/ck_tile/02_layernorm2d/README.md      |   5 +-
 .../instances/layernorm2d_fwd_api.cpp         | 155 ++++++
 .../layernorm2d_fwd_bf16_n1024_instance.cpp   |  22 +
 .../layernorm2d_fwd_bf16_n1536_instance.cpp   |  13 +
 .../layernorm2d_fwd_bf16_n2048_instance.cpp   |  14 +
 .../layernorm2d_fwd_bf16_n256_instance.cpp    |  12 +
 .../layernorm2d_fwd_bf16_n3072_instance.cpp   |  14 +
 .../layernorm2d_fwd_bf16_n4096_instance.cpp   |  14 +
 ...layernorm2d_fwd_bf16_n4096_tp_instance.cpp |  14 +
 .../layernorm2d_fwd_bf16_n512_instance.cpp    |  13 +
 ...layernorm2d_fwd_bf16_n64_n128_instance.cpp |  12 +
 .../layernorm2d_fwd_bf16_n768_instance.cpp    |  12 +
 .../layernorm2d_fwd_fp16_n1024_instance.cpp   |  22 +
 .../layernorm2d_fwd_fp16_n1536_instance.cpp   |  13 +
 .../layernorm2d_fwd_fp16_n2048_instance.cpp   |  14 +
 .../layernorm2d_fwd_fp16_n256_instance.cpp    |  12 +
 .../layernorm2d_fwd_fp16_n3072_instance.cpp   |  14 +
 .../layernorm2d_fwd_fp16_n4096_instance.cpp   |  14 +
 ...layernorm2d_fwd_fp16_n4096_tp_instance.cpp |  14 +
 .../layernorm2d_fwd_fp16_n512_instance.cpp    |  13 +
 ...layernorm2d_fwd_fp16_n64_n128_instance.cpp |  12 +
 .../layernorm2d_fwd_fp16_n768_instance.cpp    |  12 +
 .../layernorm2d_fwd_instance_common.hpp       |  67 +++
 .../02_layernorm2d/layernorm2d_fwd.cpp        | 236 ++++-----
 .../02_layernorm2d/layernorm2d_fwd.hpp        | 117 +++-
 .../02_layernorm2d/script/perf_test.sh        |  38 ++
 .../02_layernorm2d/script/smoke_test.sh       |  31 ++
 example/ck_tile/05_reduce/CMakeLists.txt      |  19 +
 example/ck_tile/05_reduce/reduce.cpp          | 110 ++++
 example/ck_tile/05_reduce/reduce.hpp          | 118 +++++
 example/ck_tile/CMakeLists.txt                |   1 +
 include/ck_tile/core.hpp                      |   1 +
 include/ck_tile/core/arch/utility.hpp         |  43 ++
 include/ck_tile/core/config.hpp               |   2 +
 include/ck_tile/core/container/sequence.hpp   | 122 +++++
 include/ck_tile/core/container/tuple.hpp      |  20 +
 .../core/tensor/static_distributed_tensor.hpp |  14 +
 include/ck_tile/core/tensor/sweep_tile.hpp    | 278 ++++++++++
 .../ck_tile/core/tensor/tile_distribution.hpp | 158 ++----
 .../core/utility/functional_with_tuple.hpp    | 173 ++++++
 include/ck_tile/host.hpp                      |   2 +-
 ...rm2d.hpp => reference_layernorm2d_fwd.hpp} |   0
 include/ck_tile/ops/layernorm2d.hpp           |   7 +-
 .../kernel/layernorm2d_fwd_kernel.hpp         | 499 ++++++------------
 .../kernel/layernorm2d_fwd_shape.hpp          |  78 +++
 .../block_layernorm2d_fwd_problem.hpp         |  34 --
 ...ayernorm2d_fwd_pipeline_default_policy.hpp |  99 ++++
 .../layernorm2d_fwd_pipeline_one_pass.hpp     | 119 +++++
 .../layernorm2d_fwd_pipeline_problem.hpp      |  40 ++
 .../layernorm2d_fwd_pipeline_two_pass.hpp     | 160 ++++++
 .../pipeline/tile_layernorm2d_fwd_shape.hpp   |  35 --
 .../ck_tile/ops/reduce/block/block_reduce.hpp |   2 +-
 include/ck_tile/ops/welford.hpp               |   3 +-
 .../ops/welford/block/block_welford.hpp       | 362 +++++++++++++
 .../welford/block/block_welford_problem.hpp   |  18 +
 .../ops/welford/thread/thread_welford.hpp     | 113 +---
 .../ck_tile/ops/welford/warp/warp_welford.hpp | 154 ------
 59 files changed, 2916 insertions(+), 1041 deletions(-)
 mode change 100755 => 100644 example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
 create mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
 create mode 100755 example/ck_tile/02_layernorm2d/script/perf_test.sh
 create mode 100755 example/ck_tile/02_layernorm2d/script/smoke_test.sh
 create mode 100644 example/ck_tile/05_reduce/CMakeLists.txt
 create mode 100644 example/ck_tile/05_reduce/reduce.cpp
 create mode 100644 example/ck_tile/05_reduce/reduce.hpp
 create mode 100644 include/ck_tile/core/utility/functional_with_tuple.hpp
 rename include/ck_tile/host/reference/{reference_layernorm2d.hpp => reference_layernorm2d_fwd.hpp} (100%)
 create mode 100644 include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp
 delete mode 100644 include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
 create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
 create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
 create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
 create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
 delete mode 100644 include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp
 create mode 100644 include/ck_tile/ops/welford/block/block_welford.hpp
 create mode 100644 include/ck_tile/ops/welford/block/block_welford_problem.hpp
 delete mode 100644 include/ck_tile/ops/welford/warp/warp_welford.hpp

diff --git a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
old mode 100755
new mode 100644
index b54842754..82ac0a15e
--- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
+++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc
@@ -127,44 +127,47 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
 
     switch(init_method)
     {
-        case 0: break;
-        case 1:
+    case 0: break;
+    case 1:
 
-            a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-            b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
 
-            a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-            b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-            break;
+        a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
 
-        default:
-            a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-            b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    default:
+        a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
 
-            a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-            b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-            d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
 
-            break;
+        break;
     }
 
     DeviceMem a_device_buf_re(sizeof(ADataType) * a_ms_ks_re.mDesc.GetElementSpaceSize());
     DeviceMem b_device_buf_re(sizeof(BDataType) * b_ns_ks_re.mDesc.GetElementSpaceSize());
     DeviceMem d_device_buf_re(sizeof(DDataType) * d_ms_ns_re.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf_re(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_re(sizeof(EDataType) *
+                              e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
 
     DeviceMem a_device_buf_img(sizeof(ADataType) * a_ms_ks_img.mDesc.GetElementSpaceSize());
     DeviceMem b_device_buf_img(sizeof(BDataType) * b_ns_ks_img.mDesc.GetElementSpaceSize());
     DeviceMem d_device_buf_img(sizeof(DDataType) * d_ms_ns_img.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf_img(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_img(sizeof(EDataType) *
+                               e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
 
     // Intermediate Value For E Real and Img
-    DeviceMem e_device_buf_re1(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf_img1(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
-
+    DeviceMem e_device_buf_re1(sizeof(EDataType) *
+                               e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf_img1(sizeof(EDataType) *
+                                e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
 
     a_device_buf_re.ToDevice(a_ms_ks_re.mData.data());
     b_device_buf_re.ToDevice(b_ns_ks_re.mData.data());
@@ -181,7 +184,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     // set zero for intermediate values
     e_device_buf_re1.SetZero();
     e_device_buf_img1.SetZero();
- 
+
     auto a_element_op   = AElementOp{};
     auto b_element_op   = BElementOp{};
     auto cde_element_op = CDEElementOp{alpha, beta};
@@ -189,23 +192,24 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     // device operation
     // For real Intermediate Value re_1
 
-    auto op       = DeviceOpInstance{};
-    auto invoker  = op.MakeInvoker();
-    auto argument_re1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
-                                    b_device_buf_re.GetDeviceBuffer(),
-                                    std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
-                                    e_device_buf_re1.GetDeviceBuffer(),
-                                    a_ms_ks_lengths,
-                                    a_ms_ks_strides,
-                                    b_ns_ks_lengths,
-                                    b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                    e_ms_ns_lengths,
-                                    e_ms_ns_strides,
-                                    a_element_op,
-                                    b_element_op,
-                                    cde_element_op);
+    auto op      = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+    auto argument_re1 =
+        op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
+                        b_device_buf_re.GetDeviceBuffer(),
+                        std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
+                        e_device_buf_re1.GetDeviceBuffer(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
 
     if(!op.IsSupportedArgument(argument_re1))
     {
@@ -216,7 +220,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
 
     float ave_time_re1 = invoker.Run(argument_re1, StreamConfig{nullptr, time_kernel});
 
-
     alpha = -1.f;
     beta  = 1.f;
 
@@ -228,21 +231,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     // For real Intermediate Value re_2
     // auto op       = DeviceOpInstance{};
     // auto invoker  = op.MakeInvoker();
-    auto argument_re2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
-                                    b_device_buf_img.GetDeviceBuffer(),
-                                    std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
-                                    e_device_buf_re.GetDeviceBuffer(),
-                                    a_ms_ks_lengths,
-                                    a_ms_ks_strides,
-                                    b_ns_ks_lengths,
-                                    b_ns_ks_strides,
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                    e_ms_ns_lengths,
-                                    e_ms_ns_strides,
-                                    a_element_op,
-                                    b_element_op,
-                                    cde_element_op);
+    auto argument_re2 =
+        op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
+                        b_device_buf_img.GetDeviceBuffer(),
+                        std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
+                        e_device_buf_re.GetDeviceBuffer(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
 
     if(!op.IsSupportedArgument(argument_re2))
     {
@@ -253,7 +257,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
 
     float ave_time_re2 = invoker.Run(argument_re2, StreamConfig{nullptr, time_kernel});
 
-    
     alpha = 1.f;
     beta  = 1.f;
 
@@ -261,22 +264,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     b_element_op   = BElementOp{};
     cde_element_op = CDEElementOp{alpha, beta};
 
-    auto argument_img1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
-                                b_device_buf_img.GetDeviceBuffer(),
-                                std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
-                                e_device_buf_img1.GetDeviceBuffer(),
-                                a_ms_ks_lengths,
-                                a_ms_ks_strides,
-                                b_ns_ks_lengths,
-                                b_ns_ks_strides,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                e_ms_ns_lengths,
-                                e_ms_ns_strides,
-                                a_element_op,
-                                b_element_op,
-                                cde_element_op);
-
+    auto argument_img1 =
+        op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
+                        b_device_buf_img.GetDeviceBuffer(),
+                        std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
+                        e_device_buf_img1.GetDeviceBuffer(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
 
     if(!op.IsSupportedArgument(argument_img1))
     {
@@ -290,23 +293,22 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     alpha = 1.f;
     beta  = 1.f;
 
-    auto argument_img2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
-                                b_device_buf_re.GetDeviceBuffer(),
-                                std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
-                                e_device_buf_img.GetDeviceBuffer(),
-                                a_ms_ks_lengths,
-                                a_ms_ks_strides,
-                                b_ns_ks_lengths,
-                                b_ns_ks_strides,
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-                                std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-                                e_ms_ns_lengths,
-                                e_ms_ns_strides,
-                                a_element_op,
-                                b_element_op,
-                                cde_element_op);
-
-
+    auto argument_img2 =
+        op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
+                        b_device_buf_re.GetDeviceBuffer(),
+                        std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
+                        e_device_buf_img.GetDeviceBuffer(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
 
     if(!op.IsSupportedArgument(argument_img2))
     {
@@ -317,7 +319,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
 
     float ave_time_img2 = invoker.Run(argument_img2, StreamConfig{nullptr, time_kernel});
 
-
     ck::index_t M =
         ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
 
@@ -331,9 +332,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                             sizeof(DDataType) * M * N + sizeof(EDataType) * M * N * 2;
 
-    float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1 ; 
+    float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1;
 
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
@@ -343,7 +344,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
     e_device_buf_img.FromDevice(e_ms_ns_device_result_img.mData.data());
 
     auto isRealOk = 0;
-    auto isImgOk = 0;
+    auto isImgOk  = 0;
 
     if(do_verification)
     {
@@ -366,17 +367,16 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
         auto ref_op      = ReferenceOpInstance{};
         auto ref_invoker = ref_op.MakeInvoker();
 
-        auto ref_argument_re =
-            ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
+        auto ref_argument_re = ref_op.MakeArgument(
+            a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument_re);
 
         alpha = 1.f;
         beta  = 1.f;
-   
+
         cde_element_op = CDEElementOp{alpha, beta};
 
-       
         for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0)
         {
             for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1)
@@ -395,11 +395,11 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
 
         alpha = 1.f;
         beta  = -1.f;
-   
+
         cde_element_op = CDEElementOp{alpha, beta};
 
-        auto ref_argument_re1 =
-            ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
+        auto ref_argument_re1 = ref_op.MakeArgument(
+            a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
 
         ref_invoker.Run(ref_argument_re1);
 
@@ -419,23 +419,20 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
             }
         }
 
-        isRealOk =  ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
-
-        
-
+        isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
 
         // Img Part Verification
         Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
         Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
 
-        auto ref_argument_img =
-            ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
-            
+        auto ref_argument_img = ref_op.MakeArgument(
+            a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
+
         ref_invoker.Run(ref_argument_img);
 
         alpha = 1.f;
         beta  = 1.f;
-   
+
         cde_element_op = CDEElementOp{alpha, beta};
 
         for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
@@ -454,9 +451,9 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
             }
         }
 
-        auto ref_argument_img1 =
-            ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
-            
+        auto ref_argument_img1 = ref_op.MakeArgument(
+            a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
+
         ref_invoker.Run(ref_argument_img1);
 
         for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
@@ -475,7 +472,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
             }
         }
 
-        isImgOk =  ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
+        isImgOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
 
         return (isRealOk && isImgOk);
     }
diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt
index bac5f45cd..feae5f791 100644
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
@@ -1,4 +1,21 @@
+set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-add_executable(tile_example_layernorm2d_fwd EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
-target_compile_options(tile_example_layernorm2d_fwd PRIVATE -DSAVE_MEAN_INV_STD)
\ No newline at end of file
+message("adding example ${EXAMPLE_LAYERNORM2D_FWD}")
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
+target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${INSTANCE_SRCS})
+
+set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+target_compile_options(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md
index 66b16c1b7..405325a2a 100644
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -6,8 +6,7 @@ This folder contains example for Layernorm2D forward using ck_tile tile-programm
 ```
 # in the root of ck_tile
 mkdir build && cd build
-# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_example_layernorm2d_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
@@ -20,4 +19,4 @@ args:
           -e    epsilon (default:1e-5)
           -v    cpu validation or not (default:1)
        -prec    precision (default:fp16)
-```
\ No newline at end of file
+```
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
new file mode 100644
index 000000000..f2f51de5d
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_>
+using trait_ = layernorm2d_fwd_traits_<DataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kSaveMeanInvStd_,
+                                       kTwoPass_>;
+
+template <typename data_type>
+float layernorm2d_fwd_b16_(layernorm2d_fwd_traits /*t*/,
+                           layernorm2d_fwd_args a,
+                           const ck_tile::stream_config& s)
+{
+#if 1
+    float r = -1;
+    // clang-format off
+    //                                            rm  rn  tm   tn  vn  pd     mv     2p
+    if(a.n <= 64) {
+            r = layernorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 128) {
+        if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type, 1,  2,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 256) {
+        if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 512) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 8,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 768) {
+        if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 6,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1,12,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 1024) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 2,  128, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 2,  128, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 1536) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 4,   64, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 2,  128, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 2048) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 1, 1,  256, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 8, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 3072) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  128, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1, 1024, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 4096) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, false>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n > 4096) {
+        if (a.n % 8 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, true>>(s, a);
+        else if (a.n % 4 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, true>>(s, a);
+        else if (a.n % 2 == 0)
+            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, true>>(s, a);
+        else
+            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, true>>(s, a);
+    }
+    return r;
+#else
+    return layernorm2d_fwd_<trait_<data_type,  1, 1,  1,  256, 4,  true,  false, false>>(s, a);
+#endif
+    // clang-format on
+}
+
+float layernorm2d_fwd(layernorm2d_fwd_traits t,
+                      layernorm2d_fwd_args a,
+                      const ck_tile::stream_config& s)
+{
+
+    float r = -1;
+    if(t.data_type.compare("fp16") == 0)
+    {
+        return layernorm2d_fwd_b16_<ck_tile::fp16_t>(t, a, s);
+    }
+    else if(t.data_type.compare("bf16") == 0)
+    {
+        return layernorm2d_fwd_b16_<ck_tile::bf16_t>(t, a, s);
+    }
+    if(r < 0)
+        throw std::runtime_error("Without supported instances!");
+
+    return r;
+}
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
new file mode 100644
index 000000000..2a20d1e05
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+#if 0
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
+
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
+#endif
+
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
new file mode 100644
index 000000000..d043efc86
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
new file mode 100644
index 000000000..a6ffc8cd2
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
new file mode 100644
index 000000000..80beeca67
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
new file mode 100644
index 000000000..b362a550a
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
new file mode 100644
index 000000000..9c2d78999
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..c0c75f878
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
new file mode 100644
index 000000000..1bcd0f8a7
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
new file mode 100644
index 000000000..6b25fce8c
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
new file mode 100644
index 000000000..c4400f0f2
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
new file mode 100644
index 000000000..7f0e4898c
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+#if 0
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
+
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
+#endif
+
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
new file mode 100644
index 000000000..8c3a42cc4
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
new file mode 100644
index 000000000..04d8bc153
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
new file mode 100644
index 000000000..c32574749
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
new file mode 100644
index 000000000..c71db57a6
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
new file mode 100644
index 000000000..f3ca0932e
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..242f1d2dd
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
new file mode 100644
index 000000000..e3bfa8e3a
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
new file mode 100644
index 000000000..90d960cf0
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
new file mode 100644
index 000000000..0960a95c3
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd     mv     2p
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
+template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
new file mode 100644
index 000000000..22895e8ed
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
@@ -0,0 +1,67 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = layernorm2d_fwd_args;
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_>
+using trait_ = layernorm2d_fwd_traits_<DataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kSaveMeanInvStd_,
+                                       kTwoPass_>;
+
+template <typename Traits_>
+float layernorm2d_fwd_(const S& s, A a)
+{
+    using DataType = typename Traits_::DataType;
+
+    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
+        typename LayerNormTypeConfig<DataType>::XDataType,
+        typename LayerNormTypeConfig<DataType>::GammaDataType,
+        typename LayerNormTypeConfig<DataType>::BetaDataType,
+        typename LayerNormTypeConfig<DataType>::ComputeDataType,
+        typename LayerNormTypeConfig<DataType>::YDataType,
+        typename LayerNormTypeConfig<DataType>::MeanDataType,
+        typename LayerNormTypeConfig<DataType>::InvStdDataType,
+        typename Traits_::Shape,
+        Traits_::kPadN,
+        Traits_::kSaveMeanInvStd,
+        Traits_::kTwoPass>;
+
+    using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Kernel = ck_tile::Layernorm2dFwd<Pipeline>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index 35f291e06..4f12d9103 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -2,161 +2,120 @@
 #include "layernorm2d_fwd.hpp"
 #include <cstring>
 
-// Host API implementation
-float layernorm2d_fwd(layernorm2d_fwd_traits t,
-                      layernorm2d_fwd_args a,
-                      const ck_tile::stream_config& s)
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
 {
-    if(t.data_type.compare("fp16") == 0)
-    {
-        using XDataType     = ck_tile::half_t;
-        using YDataType     = ck_tile::half_t;
-        using GammaDataType = ck_tile::half_t;
-        using BetaDataType  = ck_tile::half_t;
-#ifdef SAVE_MEAN_INV_STD
-        using MeanDataType   = ck_tile::half_t;
-        using InvStdDataType = ck_tile::half_t;
-#else
-        using MeanDataType   = ck_tile::null_type;
-        using InvStdDataType = ck_tile::null_type;
-#endif
-        using ComputeDataType = float;
-
-        using thread_tile = ck_tile::sequence<4, 4>;
-        using warp_tile   = ck_tile::sequence<8, 128>;
-        using block_tile  = ck_tile::sequence<32, 128>;
-
-        using Shape = ck_tile::TileLayernorm2dShape<thread_tile, warp_tile, block_tile>;
-
-        using PipelineProblem = ck_tile::BlockLayernorm2dFwdProblem<XDataType,
-                                                                    GammaDataType,
-                                                                    BetaDataType,
-                                                                    ComputeDataType,
-                                                                    YDataType,
-                                                                    MeanDataType,
-                                                                    InvStdDataType,
-                                                                    Shape,
-                                                                    true,
-                                                                    true>;
-
-        using Kernel = ck_tile::Layernorm2dFwd<PipelineProblem>;
-
-        auto kargs = Kernel::MakeKargs(
-            a.p_x, a.p_gamma, a.p_beta, a.p_y, a.p_mean, a.p_invStd, a.epsilon, a.M, a.N);
-
-        const dim3 grids      = Kernel::GridSize(a.M);
-        constexpr dim3 blocks = Kernel::BlockSize();
-
-        constexpr ck_tile::index_t kBlockPerCu = Shape::kMWarpPerBlock * Shape::kNWarpPerBlock;
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    }
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
 
-    return 0;
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
 }
 
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3328", "m dimension")
-        .insert("n", "4096", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
         .insert("e", "1e-5", "epsilon")
+        .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
         .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision");
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
 
-int main(int argc, char* argv[])
+template <typename DataType, bool SaveMeanVar>
+bool run(const ck_tile::ArgParser& arg_parser)
 {
-
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
     float epsilon         = arg_parser.get_float("e");
-    ck_tile::index_t M    = arg_parser.get_int("m");
-    ck_tile::index_t N    = arg_parser.get_int("n");
     std::string data_type = arg_parser.get_str("prec");
+    int kname             = arg_parser.get_int("kname");
     int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
 
-    using XDataType     = ck_tile::half_t;
-    using YDataType     = ck_tile::half_t;
-    using GammaDataType = ck_tile::half_t;
-    using BetaDataType  = ck_tile::half_t;
-#ifdef SAVE_MEAN_INV_STD
-    using MeanDataType   = ck_tile::half_t;
-    using InvStdDataType = ck_tile::half_t;
-#else
-    using MeanDataType = ck_tile::null_type;
-    using InvStdDataType = ck_tile::null_type;
-#endif
-    using ComputeDataType = float;
+    assert(stride >= n);
 
-    // host verify
-    ck_tile::HostTensor<XDataType> x_host({M, N});
-    ck_tile::HostTensor<GammaDataType> gamma_host({N});
-    ck_tile::HostTensor<BetaDataType> beta_host({N});
+    using TypeConfig = LayerNormTypeConfig<DataType>;
+
+    using XDataType     = typename TypeConfig::XDataType;
+    using YDataType     = typename TypeConfig::YDataType;
+    using GammaDataType = typename TypeConfig::GammaDataType;
+    using BetaDataType  = typename TypeConfig::BetaDataType;
+
+    using MeanDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::MeanDataType, ck_tile::null_type>;
+    using InvStdDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::InvStdDataType, ck_tile::null_type>;
 
-    ck_tile::HostTensor<YDataType> y_host_ref({M, N});
-    ck_tile::HostTensor<YDataType> y_host_dev({M, N});
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
 
-    ck_tile::HostTensor<MeanDataType> mean_host_ref({M});
-    ck_tile::HostTensor<InvStdDataType> invStd_host_ref({M});
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+    ck_tile::HostTensor<BetaDataType> beta_host({n});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
 
-#ifdef SAVE_MEAN_INV_STD
-    ck_tile::HostTensor<MeanDataType> mean_host_dev({M});
-    ck_tile::HostTensor<InvStdDataType> invStd_host_dev({M});
-#endif
+    ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
+    ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
 
-    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
-    ck_tile::FillUniformDistribution<GammaDataType>{-5.f, 5.f}(gamma_host);
-    ck_tile::FillUniformDistribution<BetaDataType>{-5.f, 5.f}(beta_host);
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+    ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
 
     ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
 
-#ifdef SAVE_MEAN_INV_STD
-    ck_tile::DeviceMem mean_buf(mean_host_dev.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem invStd_buf(invStd_host_dev.get_element_space_size_in_bytes());
-#endif
-
     x_buf.ToDevice(x_host.data());
     gamma_buf.ToDevice(gamma_host.data());
     beta_buf.ToDevice(beta_host.data());
 
-    layernorm2d_fwd_traits traits{data_type};
+    std::cout << "[" << data_type << "]"
+              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+
+    layernorm2d_fwd_traits traits{data_type, SaveMeanVar};
 
     layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
                               gamma_buf.GetDeviceBuffer(),
                               beta_buf.GetDeviceBuffer(),
                               y_buf.GetDeviceBuffer(),
-#ifdef SAVE_MEAN_INV_STD
-                              mean_buf.GetDeviceBuffer(),
-                              invStd_buf.GetDeviceBuffer(),
-#else
                               nullptr,
                               nullptr,
-#endif
                               epsilon,
-                              M,
-                              N};
+                              m,
+                              n,
+                              stride};
 
-    float ave_time = layernorm2d_fwd(traits, args, ck_tile::stream_config{nullptr, true});
+    float ave_time = layernorm2d_fwd(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
 
-    std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
-                           sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n +
+                           sizeof(BetaDataType) * n + sizeof(YDataType) * m * n;
 
     float gb_per_sec = num_byte / 1.E6 / ave_time;
-    std::cout << "[" << data_type << "]"
-              << " m:" << M << ", n:" << N << ", " << ave_time << " ms, " << gb_per_sec << " GB/s"
-              << std::flush;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
 
     bool pass = true;
 
@@ -174,20 +133,59 @@ int main(int argc, char* argv[])
 
         y_buf.FromDevice(y_host_dev.data());
 
-        pass = ck_tile::check_err(y_host_dev, y_host_ref);
+        auto [rtol, atol] = get_elimit<DataType>();
+        if(stride == n)
+        {
+            pass = ck_tile::check_err(
+                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+        }
+        else
+        {
+            for(int i_r = 0; i_r < m; i_r++)
+            {
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
+                                                      y_host_dev.begin() + i_r * stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
+                                                      y_host_ref.begin() + i_r * stride + n);
+                pass &= ck_tile::check_err(y_host_dev_row,
+                                           y_host_ref_row,
+                                           std::string("OUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
 
-#ifdef SAVE_MEAN_INV_STD
-        mean_buf.FromDevice(mean_host_dev.data());
-        pass &= ck_tile::check_err(mean_host_dev, mean_host_ref);
+    return pass;
+}
 
-        invStd_buf.FromDevice(invStd_host_dev.data());
-        pass &= ck_tile::check_err(invStd_host_dev, invStd_host_ref);
-#endif
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
 
-        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+    const std::string data_type = arg_parser.get_str("prec");
+    int save_mv                 = arg_parser.get_int("save_mv");
+    if(data_type == "fp16" && save_mv)
+    {
+        return run<ck_tile::half_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "fp16" && !save_mv)
+    {
+        return run<ck_tile::half_t, false>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && save_mv)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && !save_mv)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
     }
 
-    std::cout << std::endl << std::flush;
-
-    return !pass;
+    return -3;
 }
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
index 4d1aac099..861e4a023 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
@@ -8,23 +8,114 @@
 #include "ck_tile/ops/layernorm2d.hpp"
 #include <string>
 
-struct layernorm2d_fwd_traits
+template <typename DataType>
+struct LayerNormTypeConfig;
+
+template <>
+struct LayerNormTypeConfig<ck_tile::half_t>
 {
-    std::string data_type;
+    using XDataType       = ck_tile::half_t;
+    using YDataType       = ck_tile::half_t;
+    using GammaDataType   = ck_tile::half_t;
+    using BetaDataType    = ck_tile::half_t;
+    using MeanDataType    = ck_tile::half_t;
+    using InvStdDataType  = ck_tile::half_t;
+    using ComputeDataType = float;
+};
+
+template <>
+struct LayerNormTypeConfig<ck_tile::bf16_t>
+{
+    using XDataType       = ck_tile::bf16_t;
+    using YDataType       = ck_tile::bf16_t;
+    using GammaDataType   = ck_tile::bf16_t;
+    using BetaDataType    = ck_tile::bf16_t;
+    using MeanDataType    = ck_tile::bf16_t;
+    using InvStdDataType  = ck_tile::bf16_t;
+    using ComputeDataType = float;
+};
+
+// runtime args
+struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs
+{
+};
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_>
+struct layernorm2d_fwd_traits_
+{
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Layernorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN           = kPadN_;
+    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kTwoPass        = kTwoPass_;
 };
 
-struct layernorm2d_fwd_args
+template <typename Traits_>
+float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
+
+// This is the public API, will be generated by script
+struct layernorm2d_fwd_traits
 {
-    const void* p_x;
-    const void* p_gamma;
-    const void* p_beta;
-    void* p_y;
-    void* p_mean;
-    void* p_invStd;
-    float epsilon;
-    ck_tile::index_t M;
-    ck_tile::index_t N;
+    std::string data_type;
+    bool save_mean_var;
 };
 
-// host API
 float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh
new file mode 100755
index 000000000..bfb7f9ffe
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh
@@ -0,0 +1,38 @@
+
+# run from top of ck folder
+EXE=build/bin/tile_example_layernorm2d_fwd
+
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
new file mode 100755
index 000000000..dcd40fda4
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# call from top of CK folder
+EXE=./build/bin/tile_example_layernorm2d_fwd
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -prec=$pr_i -m=99  -n=13
+$EXE -prec=$pr_i -m=17  -n=16
+$EXE -prec=$pr_i -m=1   -n=100
+$EXE -prec=$pr_i -m=4   -n=128
+$EXE -prec=$pr_i -m=80  -n=127
+$EXE -prec=$pr_i -m=22  -n=255 -stride=256
+$EXE -prec=$pr_i -m=7   -n=599
+$EXE -prec=$pr_i -m=19  -n=512
+$EXE -prec=$pr_i -m=33  -n=313 -stride=1000
+$EXE -prec=$pr_i -m=11  -n=510
+$EXE -prec=$pr_i -m=171 -n=676 -stride=818
+$EXE -prec=$pr_i -m=91  -n=636
+$EXE -prec=$pr_i -m=12  -n=768 -stride=800
+$EXE -prec=$pr_i -m=100 -n=766 -stride=812
+$EXE -prec=$pr_i -m=31  -n=1024
+$EXE -prec=$pr_i -m=64  -n=1000 -stride=1004
+$EXE -prec=$pr_i -m=8   -n=1501
+$EXE -prec=$pr_i -m=3   -n=1826
+$EXE -prec=$pr_i -m=5   -n=2040
+$EXE -prec=$pr_i -m=7   -n=2734
+$EXE -prec=$pr_i -m=1   -n=3182
+$EXE -prec=$pr_i -m=9   -n=4096
+$EXE -prec=$pr_i -m=3   -n=8192
+$EXE -prec=$pr_i -m=1   -n=10547
+$EXE -prec=$pr_i -m=3   -n=17134
+done
diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt
new file mode 100644
index 000000000..6caa38d50
--- /dev/null
+++ b/example/ck_tile/05_reduce/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(EXAMPLE_REDUCE "tile_example_reduce")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding example ${EXAMPLE_REDUCE}")
+
+add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp)
+target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+set(EXAMPLE_REDUCE_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_REDUCE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+target_compile_options(${EXAMPLE_REDUCE} PRIVATE ${EXAMPLE_REDUCE_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
\ No newline at end of file
diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
new file mode 100644
index 000000000..7973a8dfd
--- /dev/null
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -0,0 +1,110 @@
+#include "ck_tile/host.hpp"
+#include "reduce.hpp"
+#include <cstring>
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using ADataType   = DataType;
+    using AccDataType = float;
+    using BDataType   = DataType;
+
+    ck_tile::index_t m = arg_parser.get_int("m");
+    ck_tile::index_t n = arg_parser.get_int("n");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    ck_tile::HostTensor<ADataType> a_host({m, n});
+    ck_tile::HostTensor<BDataType> b_host_ref({m});
+    ck_tile::HostTensor<BDataType> b_host_dev({m});
+
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+
+    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes());
+
+    a_buf.ToDevice(a_host.data());
+
+    using BlockWarps = ck_tile::sequence<4, 1>;
+    using BlockTile  = ck_tile::sequence<128, 128>;
+    using WarpTile   = ck_tile::sequence<32, 128>;
+    using ThreadTile = ck_tile::sequence<8, 8>;
+
+    constexpr ck_tile::index_t kBlockSize  = 256;
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
+    std::cout << "grid size " << kGridSize << std::endl;
+
+    using Kernel = ck_tile::Reduce<ADataType,
+                                   AccDataType,
+                                   BDataType,
+                                   kBlockSize,
+                                   BlockWarps,
+                                   BlockTile,
+                                   WarpTile,
+                                   ThreadTile>;
+
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
+                                       static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                       m,
+                                       n));
+
+    std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        ck_tile::reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
+        b_buf.FromDevice(b_host_dev.mData.data());
+        pass = ck_tile::check_err(b_host_dev, b_host_ref);
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    if(data_type == "bf16")
+    {
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    }
+}
diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp
new file mode 100644
index 000000000..e36b46895
--- /dev/null
+++ b/example/ck_tile/05_reduce/reduce.hpp
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+
+namespace ck_tile {
+
+template <typename ADataType,
+          typename AccDataType,
+          typename BDataType,
+          index_t kBlockSize,
+          typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename ThreadTile> // contiguous pixels(vector size) along seq<M, N>
+struct Reduce
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
+
+    static constexpr index_t Thread_M = ThreadTile::at(number<0>{});
+    static constexpr index_t Thread_N = ThreadTile::at(number<1>{});
+
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
+
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Thread_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Thread_N;
+
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    __device__ static constexpr auto MakeABlockTileDistribution()
+    {
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Repeat_M, WarpPerBlock_M, ThreadPerWarp_M, Thread_M>,
+                      sequence<Repeat_N, WarpPerBlock_N, ThreadPerWarp_N, Thread_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+
+    __device__ void operator()(const ADataType* p_a, BDataType* p_b, index_t M, index_t N) const
+    {
+        const auto a_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_a, make_tuple(M, N), make_tuple(N, 1), number<Thread_N>{}, number<1>{});
+
+        const auto iM = get_block_id() * Block_M;
+
+        // A window
+        auto a_block_window = make_tile_window(a_m_n,
+                                               make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                               {iM, 0},
+                                               MakeABlockTileDistribution());
+
+        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        const ADataType reduce_init_value = 0;
+
+        constexpr auto reduce_dims = sequence<1>{};
+
+        // Acc tile
+        // TODO: support cross warp reduction
+        auto acc_block_tensor = decltype(block_tile_reduce<AccDataType>(
+            load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){};
+
+        // init Acc tile
+        tile_elementwise_inout(
+            [&](auto& acc) { acc = type_convert<AccDataType>(reduce_init_value); },
+            acc_block_tensor);
+
+        // loop
+        index_t iN = 0;
+
+        do
+        {
+            const auto a_block_tensor = load_tile(a_block_window);
+
+            // FIXME: support cross warp reduction
+            block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce);
+
+            move_tile_window(a_block_window, {0, Block_N});
+
+            iN += Block_N;
+
+        } while(iN < N);
+
+        // FIXME: support cross warp reduction
+        block_tile_reduce_sync(acc_block_tensor, f_reduce);
+
+        // convert acc_block_tensor to b_block_tensor
+        const auto b_block_tensor = tile_elementwise_in(
+            [](const auto& acc) { return type_convert<BDataType>(acc); }, acc_block_tensor);
+
+        // B
+        const auto b_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_b, make_tuple(M), number<32>{});
+
+        // B window
+        auto b_block_window = make_tile_window(b_m, make_tuple(number<Block_M>{}), {iM});
+
+        // store B tile
+        store_tile(b_block_window, b_block_tensor);
+    }
+};
+
+} // namespace ck_tile
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index fe1e9c9ed..ec4a175d3 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -6,3 +6,4 @@ add_subdirectory(01_fmha)
 add_subdirectory(02_layernorm2d)
 add_subdirectory(03_gemm)
 add_subdirectory(04_img2col)
+add_subdirectory(05_reduce)
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 4cddf6faa..d96f14710 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -52,6 +52,7 @@
 #include "ck_tile/core/tensor/update_tile.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/functional_with_tuple.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
 #include "ck_tile/core/utility/magic_div.hpp"
 #include "ck_tile/core/utility/philox_rand.hpp"
diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp
index 42508e66a..a88780459 100644
--- a/include/ck_tile/core/arch/utility.hpp
+++ b/include/ck_tile/core/arch/utility.hpp
@@ -59,4 +59,47 @@ CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
 #endif
 }
 
+template <typename T>
+CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
+{
+#if 0
+    return  __shfl(v_local, src_lane);
+#elif 1
+    if constexpr(sizeof(int32_t) > sizeof(T))
+    {
+        union packet
+        {
+            int32_t x;
+            T v;
+        };
+        packet p;
+        p.v = v_local;
+        packet p_remote;
+        p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
+
+        return p_remote.v;
+    }
+    else if constexpr(sizeof(int32_t) == sizeof(T))
+    {
+        const int32_t v_remote_tmp =
+            __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
+
+        return bit_cast<T>(v_remote_tmp);
+    }
+    else
+    {
+        static_assert(sizeof(T) % sizeof(int32_t) == 0, "wrong!");
+        constexpr index_t elm = sizeof(T) / sizeof(int32_t);
+        using vector_type     = thread_buffer<int32_t, elm>;
+        auto vs               = bit_cast<vector_type>(v_local);
+        auto vs_remote        = vector_type{};
+        static_for<0, elm, 1>{}([&](auto i_e) {
+            int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
+            vs_remote(i_e) = tmp;
+        });
+        return bit_cast<T>(vs_remote);
+    }
+#endif
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index a8bc27cdf..580faae92 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -32,11 +32,13 @@
 #define CK_TILE_DEVICE inline __device__
 #define CK_TILE_HOST_DEVICE inline __host__ __device__
 #define CK_TILE_DEVICE_EXTERN __device__
+#define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
 #else
 #define CK_TILE_HOST inline
 #define CK_TILE_DEVICE inline
 #define CK_TILE_HOST_DEVICE inline
 #define CK_TILE_DEVICE_EXTERN
+#define CK_TILE_HOST_DEVICE_EXTERN
 #endif
 
 #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index acf187cfc..4fcea9642 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -1111,4 +1111,126 @@ CK_TILE_HOST_DEVICE constexpr auto generate_array(F&& f, number<N>)
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
 }
 
+namespace impl {
+template <typename, typename, typename, index_t>
+struct reverse_slice_sequence_impl;
+
+template <index_t x,
+          index_t... xs,
+          index_t m,
+          index_t... ms,
+          index_t id,
+          index_t... ids,
+          index_t SliceSize>
+struct reverse_slice_sequence_impl<sequence<x, xs...>,
+                                   sequence<m, ms...>,
+                                   sequence<id, ids...>,
+                                   SliceSize>
+{
+    using old_scan =
+        reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
+
+    static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
+    static constexpr auto slice_length =
+        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
+
+    using dim_lengths =
+        typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
+    using dim_slices =
+        typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
+    using remaining_slice_sizes = typename sequence_merge<
+        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
+        typename old_scan::remaining_slice_sizes>::type;
+
+    // the first idx that sliced length not equal to original length
+    static constexpr index_t _flag =
+        slice_length != x && remaining_slice_sizes{}.front().value == 1;
+    static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
+    static constexpr index_t _split_idx =
+        std::conditional_t<_split_flag, number<id>, number<0>>::value;
+
+    static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
+    static constexpr index_t split_idx  = std::
+        conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
+};
+
+template <index_t x, index_t m, index_t id, index_t SliceSize>
+struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
+{
+    static constexpr auto slice_size = SliceSize;
+    static constexpr auto slice_length =
+        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
+
+    using dim_lengths = sequence<slice_length>;
+    using dim_slices  = sequence<x / slice_length>;
+    using remaining_slice_sizes =
+        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
+
+    // the first idx that sliced length not equal to original length
+    static constexpr index_t _flag =
+        slice_length != x && remaining_slice_sizes{}.front().value == 1;
+    static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
+    static constexpr index_t split_idx =
+        std::conditional_t<split_flag, number<id>, number<0>>::value;
+};
+} // namespace impl
+
+// clang-format off
+// input a sequence(with optional mask), and the SliceSize : size per slice
+// output the sequence each slice, and number of slices
+//
+// e.g. <2, 1, 4, 2>, 8     -> lengths:<1, 1, 4, 2>    , nums: <2, 1, 1, 1>    : 2 slices  , slice_idx: 0
+//      <4, 2, 4, 1, 2>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
+//      <4, 2, 4, 1, 6>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
+//      <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices  , slice_idx: 1
+//
+//      <4, 2, 8>, 64       -> lengths:<4, 2, 8>       , nums: <1, 1, 1>       : 1  slices , slice_idx: 0
+//      <4, 2, 8>, 32       -> lengths:<2, 2, 8>       , nums: <2, 1, 1>       : 2  slices , slice_idx: 0
+//      <4, 2, 8>, 16       -> lengths:<1, 2, 8>       , nums: <4, 1, 1>       : 4  slices , slice_idx: 0
+//      <4, 2, 8>, 8        -> lengths:<1, 1, 8>       , nums: <4, 2, 1>       : 8  slices , slice_idx: 1
+//      <4, 2, 8>, 4        -> lengths:<1, 1, 4>       , nums: <4, 2, 2>       : 16 slices , slice_idx: 2
+//      <4, 2, 8>, 2        -> lengths:<1, 1, 2>       , nums: <4, 2, 4>       : 32 slices , slice_idx: 2
+//      <4, 2, 8>, 1        -> lengths:<1, 1, 1>       , nums: <4, 2, 8>       : 64 slices , slice_idx: 2
+//
+//      <4, 2, 1, 4, 2> / 4 ->
+// mask:<1, 1, 1, 0, 1>,    -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices  , slice_idx: 0
+//
+// return tuple<slice_lengths, slice_nums, slice_index>, slice_index is at which index will start
+// have split slices (right -> left)
+//  or the first index that sliced length is different from the original length
+// clang-format on
+template <typename Seq,
+          index_t SliceSize,
+          typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
+constexpr auto reverse_slice_sequence(Seq,
+                                      number<SliceSize>,
+                                      Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
+{
+    static_assert(Seq::size() == Mask::size());
+    using sliced_type =
+        impl::reverse_slice_sequence_impl<Seq,
+                                          Mask,
+                                          typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
+                                          SliceSize>;
+    static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
+                  "can not evenly divide this sequence, please check");
+    return make_tuple(typename sliced_type::dim_lengths{},
+                      typename sliced_type::dim_slices{},
+                      number<sliced_type::split_idx>{});
+}
+
+template <typename Seq,
+          index_t SliceSize,
+          typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
+constexpr auto slice_sequence(Seq,
+                              number<SliceSize>,
+                              Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
+{
+    constexpr auto r =
+        reverse_slice_sequence(Seq{}.reverse(), number<SliceSize>{}, Mask{}.reverse());
+    return make_tuple(r[number<0>{}].reverse(),
+                      r[number<1>{}].reverse(),
+                      number<Seq::size() - r[number<2>{}] - 1>{});
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index cb8c2c70c..598dfeea3 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -488,6 +488,26 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y,
         f, x, y, z, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
 }
 
+namespace detail {
+
+template <typename F, typename X, index_t... Is>
+CK_TILE_HOST_DEVICE constexpr auto embed_tuples_impl(F f, const X& x, sequence<Is...>)
+{
+    return concat_tuple(f(x.at(number<Is>{}))...);
+}
+
+} // namespace detail
+
+// make sure F return at least a tuple
+// e.g. x : tuple<X, Y>, f will return tuple<Z, W>
+// this function will return
+template <typename F, typename X>
+CK_TILE_HOST_DEVICE constexpr auto embed_tuples(F f, const X& x)
+{
+    return detail::embed_tuples_impl(
+        f, x, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
+}
+
 // By default unroll to the flatten
 template <index_t Depth = 0, index_t MaxDepth = -1>
 CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const tuple<>& t)
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index 299a74bc0..29c20bed0 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -187,4 +187,18 @@ set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_ten
     });
 }
 
+// this function used inside span loop over
+template <typename YLengths, index_t XUnpacks>
+CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number<XUnpacks>)
+{
+    constexpr auto y_size  = reduce_on_sequence(YLengths{}, multiplies{}, number<1>{});
+    constexpr auto y_packs = number<XUnpacks>{};
+    static_assert(y_size % y_packs == 0);
+    constexpr auto y_slice_size = y_size / y_packs;
+
+    constexpr auto slice_info = slice_sequence(YLengths{}, number<y_slice_size>{});
+    constexpr auto unpacks    = slice_info[number<1>{}];
+    return unpacks;
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/sweep_tile.hpp b/include/ck_tile/core/tensor/sweep_tile.hpp
index f1511f11d..f82f6b5bc 100644
--- a/include/ck_tile/core/tensor/sweep_tile.hpp
+++ b/include/ck_tile/core/tensor/sweep_tile.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/functional_with_tuple.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
@@ -27,4 +28,281 @@ CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F& f)
     });
 }
 
+// unpacked span, this version support span with unpack(multi-arg) functor
+//
+template <
+    typename TileDistributedSpan_, // tile_distributed_span<...>
+    typename F,                    // signature: F(tile_distributed_index<...>)
+    typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
+CK_TILE_DEVICE void sweep_tile_uspan(TileDistributedSpan_, const F& f, Unpacks = {})
+{
+    using DstrSpan = remove_cvref_t<TileDistributedSpan_>;
+
+    static_uford<typename DstrSpan::Impl, Unpacks>{}(
+        [&](auto... dstr_idx_impl) { f(detail::make_tile_distributed_index(dstr_idx_impl)...); });
+}
+
+namespace impl {
+
+template <typename, typename, typename>
+struct sweep_tile_impl;
+
+template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
+struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>>
+{
+    CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const
+    {
+        constexpr auto spans     = DistributedTensor::get_distributed_spans();
+        constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
+        constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
+        constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
+        return y_unpacks;
+    }
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        return u.get_num_of_access() *
+               sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                   .get_num_of_access();
+    }
+    template <typename F, typename SpanIdx>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+
+        sweep_tile_uspan(
+            spans[number<I>{}],
+            [&](auto... i_idx) {
+                const auto next_span_idx = embed_tuples(
+                    [&](auto si) { return make_tuple(concat_tuple(si, make_tuple(i_idx))...); },
+                    span_idx);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx);
+            },
+            get_y_unpacks());
+    }
+    template <typename F, typename SpanIdx, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        constexpr auto access_stride =
+            sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                .get_num_of_access();
+        constexpr auto curr_i_access = number<i_access / access_stride>{};
+        constexpr auto next_i_access = number<i_access % access_stride>{};
+        u(
+            [&](auto... i_idx) {
+                const auto next_span_idx = embed_tuples(
+                    [&](auto si) {
+                        return make_tuple(concat_tuple(
+                            si, make_tuple(detail::make_tile_distributed_index(i_idx)))...);
+                    },
+                    span_idx);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx, next_i_access);
+            },
+            curr_i_access);
+    }
+};
+
+template <typename DistributedTensor, typename UnpacksPerXDim>
+struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<>>
+{
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const { return 1; }
+    template <typename F, typename SpanIdx>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const
+    {
+        unpack(f, span_idx);
+    }
+    template <typename F, typename SpanIdx, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const
+    {
+        unpack(f, span_idx);
+    }
+};
+
+template <typename, typename, typename>
+struct sweep_tile_impl_0;
+
+// TODO: support empty tuple to remove this "entry-point" like function
+template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
+struct sweep_tile_impl_0<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>>
+{
+    CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const
+    {
+        constexpr auto spans     = DistributedTensor::get_distributed_spans();
+        constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
+        constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
+        constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
+        return y_unpacks;
+    }
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        return u.get_num_of_access() *
+               sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                   .get_num_of_access();
+    }
+    template <typename F>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f) const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        sweep_tile_uspan(
+            spans[number<I>{}],
+            [&](auto... i_idx) {
+                constexpr auto next_span_idx = make_tuple(make_tuple(i_idx)...);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx);
+            },
+            get_y_unpacks());
+    }
+    template <typename F, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, number<i_access>) const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        constexpr auto access_stride =
+            sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                .get_num_of_access();
+        constexpr auto curr_i_access = number<i_access / access_stride>{};
+        constexpr auto next_i_access = number<i_access % access_stride>{};
+        u(
+            [&](auto... i_idx) {
+                constexpr auto next_span_idx =
+                    make_tuple(make_tuple(detail::make_tile_distributed_index(i_idx))...);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx, next_i_access);
+            },
+            curr_i_access);
+    }
+};
+
+} // namespace impl
+
+/*
+ * Enhanced sweep-tile utility, can control unpacks along each X-dim
+ * the lambda function argument is the distributed-idx, which can directly
+ * plugged into the distributed tensor as setter/getter
+ *
+ * e.g. below function, y with the type DistributedTensor, r is row scale
+ *
+ * // sweep tile 1 by 1
+ * sweep_tile<DistributedTensor>([&](auto idx) {
+ *     constexpr auto row_id = make_tuple(idx[number<0>{}]);
+ *     y(idx)                = y(idx) * r(row_id);
+ * });
+ *
+ * // sweep tile with 2 pixel from last dim each function call
+ * sweep_tile<DistributedTensor>(
+ *     [&](auto idx_0, auto idx_1) {
+ *         constexpr auto row_id = make_tuple(idx_0[number<0>{}]);
+ *         y(idx_0)              = y(idx_0) * r(row_id);
+ *         y(idx_1)              = y(idx_1) * r(row_id);
+ *     },
+ *     sequence<1, 2>{});
+ *
+ * // sweep tile with 2x2 pixel each function call
+ * sweep_tile<DistributedTensor>(
+ *     [&](auto idx_00, auto idx_01, auto idx_10, auto idx_11) {
+ *         constexpr auto row_id0 = make_tuple(idx_00[number<0>{}]);
+ *         constexpr auto row_id1 = make_tuple(idx_10[number<0>{}]);
+ *         y(idx_00)              = y(idx_00) * r(row_id0);
+ *         y(idx_01)              = y(idx_01) * r(row_id0);
+ *         y(idx_10)              = y(idx_10) * r(row_id1);
+ *         y(idx_11)              = y(idx_11) * r(row_id1);
+ *     },
+ *     sequence<2, 2>{});
+ *
+ * TODO: do we need constexpr? lambda function could be non-constexpr
+ */
+template <typename DistributedTensor,
+          typename F,
+          typename UnpacksPerXDim =
+              typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE constexpr void sweep_tile(const F& f, UnpacksPerXDim = {})
+{
+    constexpr auto spans = DistributedTensor::get_distributed_spans();
+
+    impl::sweep_tile_impl_0<DistributedTensor,
+                            UnpacksPerXDim,
+                            typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(f);
+}
+
+template <typename DistributedTensor,
+          typename F,
+          typename UnpacksPerXDim =
+              typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE constexpr void
+sweep_tile(const DistributedTensor&, const F& f, UnpacksPerXDim = {})
+{
+    sweep_tile<DistributedTensor, F, UnpacksPerXDim>(f, UnpacksPerXDim{});
+}
+
+/*
+ * construct a sweep tile instance, which support issue the lambda one by one
+ * Note that this struct will hold the lambda functor, but will not hold the distributed tensor
+ * the functionality is the same as sweep_tile()
+ */
+template <typename DistributedTensor_,
+          typename F_,
+          typename UnpacksPerXDim_ =
+              typename uniform_sequence_gen<DistributedTensor_::get_num_of_dimension(), 1>::type>
+struct tile_sweeper
+{
+    using DistributedTensor = remove_cvref_t<DistributedTensor_>;
+    using F                 = remove_cvref_t<F_>;
+    using UnpacksPerXDim    = remove_cvref_t<UnpacksPerXDim_>;
+
+    CK_TILE_HOST_DEVICE tile_sweeper(const F& f_, UnpacksPerXDim = {}) : f(f_) {}
+    CK_TILE_HOST_DEVICE tile_sweeper(const DistributedTensor&, const F& f_, UnpacksPerXDim = {})
+        : f(f_)
+    {
+    }
+    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access()
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto tmp =
+            impl::sweep_tile_impl_0<DistributedTensor,
+                                    UnpacksPerXDim,
+                                    typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{};
+        return tmp.get_num_of_access();
+    }
+
+    CK_TILE_HOST_DEVICE void operator()() const
+    {
+        sweep_tile<DistributedTensor>(f, UnpacksPerXDim{});
+    }
+
+    template <index_t i_access>
+    CK_TILE_HOST_DEVICE void operator()(number<i_access>) const
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+
+        impl::sweep_tile_impl_0<DistributedTensor,
+                                UnpacksPerXDim,
+                                typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(
+            f, number<i_access>{});
+    }
+    F f;
+};
+
+// partial deduction is not allowed
+// template <typename T, typename F, typename U>
+// CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
+
+// deduction guide
+template <typename T,
+          typename F,
+          typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const T&, const F&, U = {})->tile_sweeper<T, F, U>;
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp
index 24c932f0a..7761be492 100644
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -17,6 +17,14 @@
 
 namespace ck_tile {
 
+namespace detail {
+template <typename Distribution>
+CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
+{
+    return Distribution::_get_partition_index();
+}
+} // namespace detail
+
 // distributed span
 template <index_t... PartialHsLengths>
 struct tile_distributed_span
@@ -83,6 +91,21 @@ struct tile_distribution
     CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_p() { return NDimP; }
     CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_r() { return NDimR; }
 
+    CK_TILE_HOST_DEVICE static auto _get_partition_index()
+    {
+        // only support warp-tile and block-tile
+        static_assert(NDimP == 1 or NDimP == 2, "wrong!");
+
+        if constexpr(NDimP == 1)
+        {
+            return array<index_t, 1>{get_lane_id()};
+        }
+        else if constexpr(NDimP == 2)
+        {
+            return array<index_t, 2>{get_warp_id(), get_lane_id()};
+        }
+    }
+
     CK_TILE_HOST_DEVICE static constexpr auto get_lengths()
     {
 #if 0
@@ -149,6 +172,16 @@ struct tile_distribution
     }
 #endif
 
+    template <typename PartitionIndex = decltype(_get_partition_index())>
+    CK_TILE_HOST_DEVICE auto
+    calculate_index(const PartitionIndex& ps_idx = _get_partition_index()) const
+    {
+        const auto ps_ys_idx = container_concat(ps_idx, array<index_t, NDimY>{0});
+        const auto window_adaptor_thread_coord_tmp =
+            make_tensor_adaptor_coordinate(ps_ys_to_xs_, ps_ys_idx);
+        return window_adaptor_thread_coord_tmp.get_bottom_index();
+    }
+
     CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans()
     {
         constexpr auto distributed_spans_impl = DstrEncode::detail::distributed_spans_lengthss_;
@@ -421,6 +454,7 @@ struct tile_distribution_detail
 
 } // namespace detail
 
+#if 0
 // this returns a constexpr tile_distribution
 template <typename StaticTileDistributionEncoding_>
 CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_)
@@ -457,6 +491,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistribution
         detail::tile_distribution_detail<remove_cvref_t<decltype(rh_major_minor_to_hidden_ids)>>>{
         ps_ys_to_xs_adaptor, ys_to_d_descriptor};
 }
+#endif
 
 // this returns a static tile_distribution
 template <typename StaticTileDistributionEncoding_>
@@ -499,129 +534,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistr
 //***********************************************************************************
 
 namespace detail {
-
-template <typename Distribution>
-CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
-{
-    // only support warp-tile and block-tile
-    static_assert(Distribution::NDimP == 1 or Distribution::NDimP == 2, "wrong!");
-
-    if constexpr(Distribution::NDimP == 1)
-    {
-        return array<index_t, 1>{get_lane_id()};
-    }
-    else if constexpr(Distribution::NDimP == 2)
-    {
-        return array<index_t, 2>{get_warp_id(), get_lane_id()};
-    }
-}
-
-template <typename, typename, typename, index_t>
-struct reverse_slice_sequence_impl;
-
-template <index_t x,
-          index_t... xs,
-          index_t m,
-          index_t... ms,
-          index_t id,
-          index_t... ids,
-          index_t SliceSize>
-struct reverse_slice_sequence_impl<sequence<x, xs...>,
-                                   sequence<m, ms...>,
-                                   sequence<id, ids...>,
-                                   SliceSize>
-{
-    using old_scan =
-        reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
-
-    static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
-    static constexpr auto slice_length =
-        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
-
-    using dim_lengths =
-        typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
-    using dim_slices =
-        typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
-    using remaining_slice_sizes = typename sequence_merge<
-        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
-        typename old_scan::remaining_slice_sizes>::type;
-
-    // the first idx that sliced length not equal to original length
-    static constexpr index_t _flag =
-        slice_length != x && remaining_slice_sizes{}.front().value == 1;
-    static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
-    static constexpr index_t _split_idx =
-        std::conditional_t<_split_flag, number<id>, number<0>>::value;
-
-    static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
-    static constexpr index_t split_idx  = std::
-        conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
-};
-
-template <index_t x, index_t m, index_t id, index_t SliceSize>
-struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
-{
-    static constexpr auto slice_size = SliceSize;
-    static constexpr auto slice_length =
-        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
-
-    using dim_lengths = sequence<slice_length>;
-    using dim_slices  = sequence<x / slice_length>;
-    using remaining_slice_sizes =
-        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
-
-    // the first idx that sliced length not equal to original length
-    static constexpr index_t _flag =
-        slice_length != x && remaining_slice_sizes{}.front().value == 1;
-    static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
-    static constexpr index_t split_idx =
-        std::conditional_t<split_flag, number<id>, number<0>>::value;
-};
-
-// clang-format off
-// input a sequence(with optional mask), and the SliceSize : size per slice
-// output the sequence each slice, and number of slices
-//
-// e.g. <2, 1, 4, 2>, 8     -> lengths:<1, 1, 4, 2>    , nums: <2, 1, 1, 1>    : 2 slices  , slice_idx: 0
-//      <4, 2, 4, 1, 2>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
-//      <4, 2, 4, 1, 6>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
-//      <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices  , slice_idx: 1
-//
-//      <4, 2, 8>, 64       -> lengths:<4, 2, 8>       , nums: <1, 1, 1>       : 1  slices , slice_idx: 0
-//      <4, 2, 8>, 32       -> lengths:<2, 2, 8>       , nums: <2, 1, 1>       : 2  slices , slice_idx: 0
-//      <4, 2, 8>, 16       -> lengths:<1, 2, 8>       , nums: <4, 1, 1>       : 4  slices , slice_idx: 0
-//      <4, 2, 8>, 8        -> lengths:<1, 1, 8>       , nums: <4, 2, 1>       : 8  slices , slice_idx: 1
-//      <4, 2, 8>, 4        -> lengths:<1, 1, 4>       , nums: <4, 2, 2>       : 16 slices , slice_idx: 2
-//      <4, 2, 8>, 2        -> lengths:<1, 1, 2>       , nums: <4, 2, 4>       : 32 slices , slice_idx: 2
-//      <4, 2, 8>, 1        -> lengths:<1, 1, 1>       , nums: <4, 2, 8>       : 64 slices , slice_idx: 2
-//
-//      <4, 2, 1, 4, 2> / 4 ->
-// mask:<1, 1, 1, 0, 1>,    -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices  , slice_idx: 0
-//
-// return tuple<slice_lengths, slice_nums, slice_index>, slice_index is at which index will start
-// have split slices (right -> left)
-//  or the first index that sliced length is different from the original length
-// clang-format on
-template <typename Seq,
-          index_t SliceSize,
-          typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
-constexpr auto reverse_slice_sequence(Seq,
-                                      number<SliceSize>,
-                                      Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
-{
-    static_assert(Seq::size() == Mask::size());
-    using sliced_type =
-        reverse_slice_sequence_impl<Seq,
-                                    Mask,
-                                    typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
-                                    SliceSize>;
-    static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
-                  "can not evenly divide this sequence, please check");
-    return make_tuple(typename sliced_type::dim_lengths{},
-                      typename sliced_type::dim_slices{},
-                      number<sliced_type::split_idx>{});
-}
-
 //
 // slice tensor from x_dim, result in split in y_dim, not p_dim.
 // We don't support slice cross p_dim (aka, slice different threads)
diff --git a/include/ck_tile/core/utility/functional_with_tuple.hpp b/include/ck_tile/core/utility/functional_with_tuple.hpp
new file mode 100644
index 000000000..4b4040319
--- /dev/null
+++ b/include/ck_tile/core/utility/functional_with_tuple.hpp
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// This file should not be included inside tuple.hpp!
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+#include <stdint.h>
+#include <utility>
+
+namespace ck_tile {
+
+namespace detail {
+
+// RemainLengths: sequence<...>
+// Orders: sequence<...>
+template <class RemainLengths, class RamainUnpacks, class Orders>
+struct static_uford_impl
+{
+    CK_TILE_HOST_DEVICE constexpr static_uford_impl()
+    {
+        static_assert(RemainLengths::size() > 0, "wrong! should not get here");
+        static_assert(RamainUnpacks::size() > 0, "wrong! should not get here");
+    }
+
+    template <class F, class CurrentUnpackIds>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds) const
+    {
+        constexpr index_t pack_len = RamainUnpacks::front();
+        static_for<0, RemainLengths::front(), pack_len>{}([=](auto I) {
+            constexpr auto new_pack = generate_tuple(
+                [&](auto idx_) {
+                    constexpr auto i_new_pack = number<I + idx_ % pack_len>{};
+                    constexpr auto i_pre_pack = number<idx_ / pack_len>{};
+                    return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
+                },
+                number<CurrentUnpackIds::size() * pack_len>{});
+
+            static_uford_impl<decltype(RemainLengths::pop_front()),
+                              decltype(RamainUnpacks::pop_front()),
+                              Orders>{}(f, new_pack);
+        });
+    }
+};
+
+template <class Orders>
+struct static_uford_impl<sequence<>, sequence<>, Orders>
+{
+    template <class F, class PackedId>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId) const
+    {
+        constexpr auto origin_packs = transform_tuples(
+            [](auto pack_) { return decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{});
+        unpack(f, origin_packs);
+    }
+};
+
+template <class RemainLengths, class RamainUnpacks, class Orders>
+struct static_uford_one_shot_impl
+{
+    template <class F, class CurrentUnpackIds, index_t current_acc>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds, number<current_acc>) const
+    {
+        constexpr auto r_lens_stride =
+            reverse_exclusive_scan_sequence(RemainLengths{}, multiplies{}, number<1>{});
+        constexpr auto r_upks_stride =
+            reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies{}, number<1>{});
+
+        constexpr index_t current_stride = r_lens_stride.front() / r_upks_stride.front();
+        constexpr index_t pack_len       = RamainUnpacks::front();
+        constexpr index_t current_idx    = (current_acc / current_stride) * pack_len;
+
+        constexpr auto new_pack = generate_tuple(
+            [&](auto idx_) {
+                constexpr auto i_new_pack = number<current_idx + idx_ % pack_len>{};
+                constexpr auto i_pre_pack = number<idx_ / pack_len>{};
+                return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
+            },
+            number<CurrentUnpackIds::size() * pack_len>{});
+
+        static_uford_one_shot_impl<decltype(RemainLengths::pop_front()),
+                                   decltype(RamainUnpacks::pop_front()),
+                                   Orders>{}(f, new_pack, number<current_acc % current_stride>{});
+    }
+};
+
+template <class Orders>
+struct static_uford_one_shot_impl<sequence<>, sequence<>, Orders>
+{
+    template <class F, class PackedId, index_t current_acc>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId, number<current_acc>) const
+    {
+        constexpr auto origin_packs = transform_tuples(
+            [](auto pack_) { return decltype(pack_)::reorder_old_to_new(Orders{}); }, PackedId{});
+        unpack(f, origin_packs);
+    }
+};
+
+} // namespace detail
+
+// TODO: we may unify static_ford/static_uford in the future
+//
+// loop over nd space(sequence) with packs
+// you must make sure the function passed in has same number of argument
+//
+// e.g.
+// Lengths=seq<2, 3, 4>, Unpacks=<1, 1, 2>
+// static_uford<Lengths, Unpacks>{}([&](auto i_0, auto i_1){}); // require 2 args(packs)
+//
+// loop #0, i_0=seq<0, 0, 0>, i_1=<0, 0, 1>
+// loop #1, i_0=seq<0, 0, 2>, i_1=<0, 0, 3>
+// loop #2, i_0=seq<0, 1, 0>, i_1=<0, 1, 1>
+// loop #3, i_0=seq<0, 1, 2>, i_1=<0, 1, 3>
+// loop #4, i_0=seq<0, 2, 0>, i_1=<0, 2, 1>
+// loop #5, i_0=seq<0, 2, 2>, i_1=<0, 2, 3>
+// loop #6, i_0=seq<1, 0, 0>, i_1=<1, 0, 1>
+// ...
+template <class Lengths,
+          class Unpacks = typename uniform_sequence_gen<Lengths::size(), 1>::type,
+          class Orders  = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
+struct static_uford
+{
+    static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies{}, number<1>{});
+
+    CK_TILE_HOST_DEVICE constexpr static_uford()
+    {
+        static_assert(Lengths::size() > 0, "wrong! Lengths is empty");
+        static_assert(Lengths::size() == Unpacks::size(), "wrong! inconsistent size");
+        static_assert(Lengths::size() == Orders::size(), "wrong! inconsistent size");
+        static_for<0, Lengths::size(), 1>{}(
+            [&](auto i) { static_assert(Lengths{}.at(i) % Unpacks{}.at(i) == 0); });
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access()
+    {
+        using L_ = decltype(Lengths{} / Unpacks{});
+
+        return reduce_on_sequence(L_{}, multiplies{}, number<1>{});
+    }
+
+    // F signature: F(sequence<...> multi_id...)
+    // multi_id is the unordered multi-index
+    template <class F>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f) const
+    {
+        constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
+        constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
+        detail::static_uford_impl<decltype(ordered_lengths), decltype(ordered_unpacks), Orders>{}(
+            f, make_tuple(sequence<>{}));
+    }
+
+    // this version is friendly for issue function one by one
+    template <class F, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, number<i_access>) const
+    {
+        static_assert(i_access < get_num_of_access());
+        constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
+        constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
+        detail::static_uford_one_shot_impl<decltype(ordered_lengths),
+                                           decltype(ordered_unpacks),
+                                           Orders>{}(
+            f, make_tuple(sequence<>{}), number<i_access>{});
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index b382710b1..dbc1f5d23 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -21,7 +21,7 @@
 #include "ck_tile/host/reference/reference_batched_softmax.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
-#include "ck_tile/host/reference/reference_layernorm2d.hpp"
+#include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
 #include "ck_tile/host/stream_config.hpp"
diff --git a/include/ck_tile/host/reference/reference_layernorm2d.hpp b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
similarity index 100%
rename from include/ck_tile/host/reference/reference_layernorm2d.hpp
rename to include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp
index 3b66645ed..2a403b0f4 100644
--- a/include/ck_tile/ops/layernorm2d.hpp
+++ b/include/ck_tile/ops/layernorm2d.hpp
@@ -4,6 +4,9 @@
 #pragma once
 
 #include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp"
-#include "ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp"
-#include "ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp"
+#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index 468df793d..cebe5131a 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -5,37 +5,57 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
-#include "ck_tile/ops/welford/thread/thread_welford.hpp"
-#include "ck_tile/ops/welford/warp/warp_welford.hpp"
 
 namespace ck_tile {
 
-// TODO: Extract some type to wrapper class
-template <typename Problem_>
-struct Layernorm2dFwd
+// host side args
+struct Layernorm2dFwdHostArgs
 {
-    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    const void* p_x;
+    const void* p_gamma;
+    const void* p_beta;
 
-    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
-    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
-    using BetaDataType    = ck_tile::remove_cvref_t<typename Problem::BetaDataType>;
-    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
-    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
-    using MeanDataType    = ck_tile::remove_cvref_t<typename Problem::MeanDataType>;
-    using InvStdDataType  = ck_tile::remove_cvref_t<typename Problem::InvStdDataType>;
+    void* p_y;
+    void* p_mean;
+    void* p_invStd;
 
-    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
-    static constexpr bool kHasBeta    = !std::is_same_v<BetaDataType, ck_tile::null_type>;
-    static constexpr bool kSaveMean   = !std::is_same_v<MeanDataType, ck_tile::null_type>;
-    static constexpr bool kSaveInvStd = !std::is_same_v<InvStdDataType, ck_tile::null_type>;
+    float epsilon;
 
-    static constexpr ck_tile::index_t kMPerBlock = Problem::BlockShape::kMPerBlock;
-    static constexpr ck_tile::index_t kNPerBlock = Problem::BlockShape::kNPerBlock;
-    static constexpr bool kPadM                  = Problem::kPadM;
-    static constexpr bool kPadN                  = Problem::kPadN;
+    index_t m;
+    index_t n;
+    index_t stride; // row_stride
+};
 
-    static constexpr ck_tile::index_t kNThreadPerWarp = Problem::BlockShape::kNThreadPerWarp;
-    static constexpr ck_tile::index_t kNPerThread     = Problem::BlockShape::kNPerThread;
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct Layernorm2dFwd
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = remove_cvref_t<typename Problem::GammaDataType>;
+    using BetaDataType    = remove_cvref_t<typename Problem::BetaDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = remove_cvref_t<typename Problem::YDataType>;
+    using MeanDataType    = remove_cvref_t<typename Problem::MeanDataType>;
+    using InvStdDataType  = remove_cvref_t<typename Problem::InvStdDataType>;
+
+    static constexpr bool kHasGamma       = !std::is_same_v<GammaDataType, null_type>;
+    static constexpr bool kHasBeta        = !std::is_same_v<BetaDataType, null_type>;
+    static constexpr bool kSaveMeanInvStd = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveMean       = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd     = Problem::kSaveMeanInvStd;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kTwoPass   = Problem::kTwoPass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
@@ -52,400 +72,177 @@ struct Layernorm2dFwd
 
         float epsilon;
 
-        ck_tile::index_t M;
-        ck_tile::index_t N;
+        index_t m;
+        index_t n;
+        index_t stride; // row_stride
     };
+    using Hargs = Layernorm2dFwdHostArgs;
 
-    CK_TILE_HOST static constexpr Kargs MakeKargs(const void* p_x,
-                                                  const void* p_gamma,
-                                                  const void* p_beta,
-                                                  void* p_y,
-                                                  void* p_mean,
-                                                  void* p_invStd,
-                                                  float epsilon,
-                                                  ck_tile::index_t M,
-                                                  ck_tile::index_t N)
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
     {
-        return Kargs{p_x, p_gamma, p_beta, p_y, p_mean, p_invStd, epsilon, M, N};
+        return Kargs{hargs.p_x,
+                     hargs.p_gamma,
+                     hargs.p_beta,
+                     hargs.p_y,
+                     hargs.p_mean,
+                     hargs.p_invStd,
+                     hargs.epsilon,
+                     hargs.m,
+                     hargs.n,
+                     hargs.stride};
     }
 
-    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t M) { return M / kMPerBlock; }
-
-    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::kBlockSize; }
-
-    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
     {
-        using S = typename Problem::BlockShape;
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<
-                sequence<>,
-                tuple<sequence<S::kMWarpPerBlock, S::kMThreadPerWarp, S::kMPerThread>,
-                      sequence<S::kNWarpPerBlock, S::kNThreadPerWarp, S::kNPerThread>>,
-                tuple<sequence<1, 2>, sequence<1, 2>>,
-                tuple<sequence<0, 0>, sequence<1, 1>>,
-                sequence<1, 2>,
-                sequence<2, 2>>{});
+        return (hargs.m + Block_M - 1) / Block_M;
     }
 
-    CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution()
-    {
-        using S = typename Problem::BlockShape;
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<
-                sequence<S::kMWarpPerBlock, S::kMThreadPerWarp>,
-                tuple<sequence<S::kNWarpPerBlock, S::kNThreadPerWarp, S::kNPerThread>>,
-                tuple<sequence<0, 1>, sequence<0, 1>>,
-                tuple<sequence<0, 0>, sequence<1, 1>>,
-                sequence<1>,
-                sequence<2>>{});
-    }
-
-    CK_TILE_DEVICE static int GetWelfordMaxCount(int N)
-    {
-        constexpr ck_tile::index_t kNThreadPerBlock = kNPerBlock / kNPerThread;
-
-        int thread_id_n = get_thread_id() % kNThreadPerBlock;
-        int max_count =
-            __builtin_amdgcn_readfirstlane(N < kNPerBlock ? 0 : kNPerThread * (N / kNPerBlock));
-        int n_per_block_tail_loop =
-            __builtin_amdgcn_readfirstlane(N - max_count * kNThreadPerBlock);
-
-        if(n_per_block_tail_loop > 0)
-        {
-            int thread_max_n = (thread_id_n + 1) * kNPerThread;
-            int delta        = thread_max_n - n_per_block_tail_loop;
-            delta            = clamp(thread_max_n - n_per_block_tail_loop, 0, kNPerThread);
-            max_count += kNPerThread - delta;
-        }
-
-        return max_count;
-    }
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
 
-    template <typename DistributedTensor>
-    CK_TILE_DEVICE static auto InvSqrt(const DistributedTensor& in_dstr_tensor,
-                                       const ComputeDataType epsilon)
-    {
-        // TODO: Investigate fast inverse square root algorithm with epsilon
-        constexpr auto spans = DistributedTensor::get_distributed_spans();
-
-        DistributedTensor out_dstr_tensor;
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
 
-        sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
-            constexpr auto i_idx   = make_tuple(idx0);
-            out_dstr_tensor(i_idx) = type_convert<ComputeDataType>(1.0f) /
-                                     ck_tile::sqrt(in_dstr_tensor[i_idx] + epsilon);
-        });
-
-        return out_dstr_tensor;
-    }
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
 
-    template <typename XBlockWindow,
-              typename GammaBlockWindow,
-              typename BetaBlockWindow,
-              typename YBlockWindow,
-              typename MeanBlockWindow,
-              typename InvStdBlockWindow,
-              bool Cond = (kHasGamma && kHasBeta)>
-    CK_TILE_DEVICE std::enable_if_t<Cond>
-    TwoPassLayernorm2dFwd(XBlockWindow& x_block_window,
-                          GammaBlockWindow& gamma_block_window,
-                          BetaBlockWindow& beta_block_window,
-                          YBlockWindow& y_block_window,
-                          MeanBlockWindow& mean_block_window,
-                          InvStdBlockWindow& inv_std_block_window,
-                          ComputeDataType epsilon,
-                          ck_tile::index_t N) const
+    CK_TILE_HOST static std::string GetName()
     {
-        // TODO - Optimize tail loop to reduce move_tile_window()
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, kNPerBlock));
-
-        int welford_max_count = GetWelfordMaxCount(N);
-        ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
-
-        using XTensorType = decltype(load_tile(x_block_window));
-        auto mean_compute_block_tensor =
-            thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
-        auto var_compute_block_tensor =
-            thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
-
-        clear_tile(mean_compute_block_tensor);
-        clear_tile(var_compute_block_tensor);
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x_block_tensor = load_tile(x_block_window);
-
-            thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor);
-            move_tile_window(x_block_window, {0, kNPerBlock});
-        }
-
-        // TODO: support cross warp Welford
-        WarpMergeWelford<ComputeDataType, true>{}(
-            mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_);
-
-        auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
-
-        if constexpr(kSaveMean)
-            store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
-        if constexpr(kSaveInvStd)
-            store_tile(inv_std_block_window,
-                       cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
-
-        // reverse read x to reuse cache
-        ck_tile::index_t stride_to_right_most_window =
-            N % kNPerBlock == 0 ? N - kNPerBlock : N - N % kNPerBlock;
-
-        move_tile_window(x_block_window, {0, -kNPerBlock});
-        move_tile_window(gamma_block_window, {stride_to_right_most_window});
-        move_tile_window(beta_block_window, {stride_to_right_most_window});
-        move_tile_window(y_block_window, {0, stride_to_right_most_window});
-
-        // Normalization
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x_block_tensor     = load_tile(x_block_window);
-            const auto gamma_block_tensor = load_tile(gamma_block_window);
-            const auto beta_block_tensor  = load_tile(beta_block_window);
-
-            constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans();
-
-            auto y_block_tensor =
-                make_static_distributed_tensor<YDataType>(x_block_tensor.get_tile_distribution());
-
-            sweep_tile_span(x_spans[I1], [&](auto idx1) {
-                constexpr auto j_idx = make_tuple(idx1);
-                const auto gamma     = type_convert<ComputeDataType>(gamma_block_tensor[j_idx]);
-                const auto beta      = type_convert<ComputeDataType>(beta_block_tensor[j_idx]);
-
-                sweep_tile_span(x_spans[I0], [&](auto idx0) {
-                    constexpr auto i_idx   = make_tuple(idx0);
-                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
-
-                    const auto mean    = mean_compute_block_tensor[i_idx];
-                    const auto inv_std = inv_std_compute_block_tensor[i_idx];
-
-                    const auto x = type_convert<ComputeDataType>(x_block_tensor[i_j_idx]);
-                    auto y       = (x - mean) * inv_std * gamma + beta;
-
-                    y_block_tensor(i_j_idx) = type_convert<YDataType>(y);
-                });
-            });
-
-            store_tile(y_block_window, y_block_tensor);
-
-            move_tile_window(x_block_window, {0, -kNPerBlock});
-            move_tile_window(gamma_block_window, {-kNPerBlock});
-            move_tile_window(beta_block_window, {-kNPerBlock});
-            move_tile_window(y_block_window, {0, -kNPerBlock});
-        }
-    }
-
-    template <typename XBlockWindow,
-              typename GammaBlockWindow,
-              typename BetaBlockWindow,
-              typename YBlockWindow,
-              typename MeanBlockWindow,
-              typename InvStdBlockWindow,
-              bool Cond = (kHasGamma && kHasBeta)>
-    CK_TILE_DEVICE std::enable_if_t<Cond>
-    OnePassLayernorm2dFwd(XBlockWindow& x_block_window,
-                          GammaBlockWindow& gamma_block_window,
-                          BetaBlockWindow& beta_block_window,
-                          YBlockWindow& y_block_window,
-                          MeanBlockWindow& mean_block_window,
-                          InvStdBlockWindow& inv_std_block_window,
-                          ComputeDataType epsilon,
-                          ck_tile::index_t N) const
-    {
-        int welford_max_count = GetWelfordMaxCount(N);
-        ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
-
-        using XTensorType = decltype(load_tile(x_block_window));
-        auto mean_compute_block_tensor =
-            thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
-        auto var_compute_block_tensor =
-            thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
-
-        clear_tile(mean_compute_block_tensor);
-        clear_tile(var_compute_block_tensor);
-
-        const auto x_block_tensor = load_tile(x_block_window);
-        thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor);
-        // TODO: support cross warp Welford
-        WarpMergeWelford<ComputeDataType, true>{}(
-            mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_);
-
-        auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
-
-        if constexpr(kSaveMean)
-            store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
-        if constexpr(kSaveInvStd)
-            store_tile(inv_std_block_window,
-                       cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
-
-        // normalize
-        const auto gamma_block_tensor = load_tile(gamma_block_window);
-        const auto beta_block_tensor  = load_tile(beta_block_window);
-
-        constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans();
-
-        auto y_block_tensor =
-            make_static_distributed_tensor<YDataType>(x_block_tensor.get_tile_distribution());
-
-        sweep_tile_span(x_spans[I1], [&](auto idx1) {
-            constexpr auto j_idx = make_tuple(idx1);
-            const auto gamma     = type_convert<ComputeDataType>(gamma_block_tensor[j_idx]);
-            const auto beta      = type_convert<ComputeDataType>(beta_block_tensor[j_idx]);
-
-            sweep_tile_span(x_spans[I0], [&](auto idx0) {
-                constexpr auto i_idx   = make_tuple(idx0);
-                constexpr auto i_j_idx = make_tuple(idx0, idx1);
-
-                const auto mean    = mean_compute_block_tensor[i_idx];
-                const auto inv_std = inv_std_compute_block_tensor[i_idx];
-
-                const auto x = type_convert<ComputeDataType>(x_block_tensor[i_j_idx]);
-                auto y       = (x - mean) * inv_std * gamma + beta;
-
-                y_block_tensor(i_j_idx) = type_convert<YDataType>(y);
-            });
-        });
-
-        store_tile(y_block_window, y_block_tensor);
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kSaveMeanInvStd) n += "_mv";
+            if (kTwoPass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("layernorm2d_fwd_") + _SS_(t2s<XDataType>::name) + "_" + 
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
-        const auto x_m_n = [&]() {
-            const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+        const auto iM = get_block_id() * Block_M;
+
+        const auto x_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const XDataType*>(kargs.p_x),
-                make_tuple(kargs.M, kargs.N),
-                make_tuple(kargs.N, 1),
-                number<kNPerThread>{},
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
                 number<1>{});
 
-            return pad_tensor_view(x_dram_naive,
-                                   make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
-                                   sequence<kPadM, kPadN>{});
+            // NOTE: we don't do any pad in this kernel for loading, assume that inside kernel will
+            // check the max count dynamically
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<false, false>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
         }();
 
-        const auto gamma_n = [&]() {
-            const auto gamma_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+        const auto gamma_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const GammaDataType*>(kargs.p_gamma),
-                make_tuple(kargs.N),
+                make_tuple(kargs.n),
                 make_tuple(1),
-                number<kNPerThread>{},
+                number<Vector_N>{},
                 number<1>{});
 
-            return pad_tensor_view(
-                gamma_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<false>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
         }();
 
-        const auto beta_n = [&]() {
-            const auto gamma_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+        const auto beta_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const BetaDataType*>(kargs.p_beta),
-                make_tuple(kargs.N),
+                make_tuple(kargs.n),
                 make_tuple(1),
-                number<kNPerThread>{},
+                number<Vector_N>{},
                 number<1>{});
 
-            return pad_tensor_view(
-                gamma_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<false>{});
+            return make_tile_window(tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {0});
         }();
 
-        const auto iM = get_block_id() * kMPerBlock;
-
-        constexpr auto xDstr = MakeXBlockTileDistribution();
-
-        auto x_block_window = make_tile_window(
-            x_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0}, xDstr);
-
-        const auto y_m_n = [&]() {
-            const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+        auto y_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<YDataType*>(kargs.p_y),
-                make_tuple(kargs.M, kargs.N),
-                make_tuple(kargs.N, 1),
-                number<kNPerThread>{},
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
                 number<1>{});
 
-            return pad_tensor_view(y_dram_naive,
-                                   make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
-                                   sequence<kPadM, kPadN>{});
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
         }();
 
-        auto y_block_window = make_tile_window(
-            y_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0});
-
-        constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution();
-        constexpr auto betaDstr  = gammaDstr;
-
-        auto gamma_block_window =
-            make_tile_window(gamma_n, make_tuple(number<kNPerBlock>{}), {0}, gammaDstr);
-
-        auto beta_block_window = make_tile_window(
-            beta_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {0}, betaDstr);
-
-        auto mean_block_window = [&]() {
+        auto mean_window = [&]() {
             if constexpr(kSaveMean)
             {
                 const auto mean_m = [&]() {
                     const auto mean_dram_naive =
                         make_naive_tensor_view_packed<address_space_enum::global>(
                             static_cast<MeanDataType*>(kargs.p_mean),
-                            make_tuple(kargs.M),
+                            make_tuple(kargs.m),
                             number<1>{});
 
                     return pad_tensor_view(
-                        mean_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
+                        mean_dram_naive, make_tuple(number<Block_M>{}), sequence<kPadM>{});
                 }();
-
-                return make_tile_window(mean_m, make_tuple(number<kMPerBlock>{}), {iM});
+                return make_tile_window(mean_m, make_tuple(number<Block_M>{}), {iM});
             }
             else
-                return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
+                return make_null_tile_window(make_tuple(number<Block_M>{}));
         }();
 
-        auto inv_std_block_window = [&]() {
+        auto inv_std_window = [&]() {
             if constexpr(kSaveInvStd)
             {
                 const auto inv_std_m = [&]() {
                     const auto inv_std_dram_naive =
                         make_naive_tensor_view_packed<address_space_enum::global>(
                             static_cast<InvStdDataType*>(kargs.p_invStd),
-                            make_tuple(kargs.M),
+                            make_tuple(kargs.m),
                             number<1>{});
 
                     return pad_tensor_view(
-                        inv_std_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
+                        inv_std_dram_naive, make_tuple(number<Block_M>{}), sequence<kPadM>{});
                 }();
-
-                return make_tile_window(inv_std_m, make_tuple(number<kMPerBlock>{}), {iM});
+                return make_tile_window(inv_std_m, make_tuple(number<Block_M>{}), {iM});
             }
             else
-                return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
+                return make_null_tile_window(make_tuple(number<Block_M>{}));
         }();
 
-        if(kargs.N <= kNPerBlock)
-            OnePassLayernorm2dFwd(x_block_window,
-                                  gamma_block_window,
-                                  beta_block_window,
-                                  y_block_window,
-                                  mean_block_window,
-                                  inv_std_block_window,
-                                  static_cast<const ComputeDataType>(kargs.epsilon),
-                                  kargs.N);
-        else
-            TwoPassLayernorm2dFwd(x_block_window,
-                                  gamma_block_window,
-                                  beta_block_window,
-                                  y_block_window,
-                                  mean_block_window,
-                                  inv_std_block_window,
-                                  static_cast<const ComputeDataType>(kargs.epsilon),
-                                  kargs.N);
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(x_window,
+                   gamma_window,
+                   beta_window,
+                   y_window,
+                   mean_window,
+                   inv_std_window,
+                   static_cast<const ComputeDataType>(kargs.epsilon),
+                   kargs.n,
+                   smem);
     }
 };
 
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp
new file mode 100644
index 000000000..e4b60331e
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+/*
+// clang-format off
+
+4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
+
+                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
+        +<----------------------< Repeat_N(2)>--------------------->+
+        |                                                           |
+        +<--    <WarpPerBlock_N(2)>  -->+
+            Warp_N
+        +--------------+--------------+--------------+--------------+----+----------------+
+ Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
+        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
+        | wrap_2       | wrap_3       |                             |    v
+        +--------------+--------------+--------------+--------------+----+           Block_M
+        |                             |                             |
+        +                             +                             |
+        |                             |                             |                     v
+        +--------------+--------------+--------------+--------------+                     +
+
+        each Warp-tile (e.g 16 thrd per row)
+
+         Vector_N (contiguous pixels each thrd holds along N, or vector size)
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
+        +-----------+-----------+-----------+-----------+-----------+
+// clang-format on
+*/
+template <typename BlockTile_,    // block size, seq<M, N>
+          typename WarpPerBlock_, // num warps along seq<M, N>
+          typename WarpTile_,     // warp size, seq<M, N>
+          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
+          index_t BlockSize_ =
+              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
+struct Layernorm2dShape
+{
+    // block size
+    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
+
+    // num warps along seq<M, N>, within each block
+    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
+
+    // warp size
+    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
+
+    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
+    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
+    // repeat of each thread along seq<M, N>
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    // vector size along seq<M, N>
+    static constexpr index_t Vector_M = Vector_::at(number<0>{});
+    static constexpr index_t Vector_N = Vector_::at(number<1>{});
+
+    static_assert(Warp_M % Vector_M == 0);
+    static_assert(Warp_N % Vector_N == 0);
+    // num of threads along seq<M, N>, within each warp
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+
+    static constexpr index_t BlockSize = BlockSize_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
deleted file mode 100644
index 707a38f62..000000000
--- a/include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core/utility/type_traits.hpp"
-
-namespace ck_tile {
-
-template <typename XDataType_,
-          typename GammaDataType_,
-          typename BetaDataType_,
-          typename ComputeDataType_,
-          typename YDataType_,
-          typename MeanDataType_,
-          typename InvStdDataType_,
-          typename BlockShape_,
-          bool kPadM_,
-          bool kPadN_>
-struct BlockLayernorm2dFwdProblem
-{
-    using XDataType             = remove_cvref_t<XDataType_>;
-    using GammaDataType         = remove_cvref_t<GammaDataType_>;
-    using BetaDataType          = remove_cvref_t<BetaDataType_>;
-    using ComputeDataType       = remove_cvref_t<ComputeDataType_>;
-    using YDataType             = remove_cvref_t<YDataType_>;
-    using MeanDataType          = remove_cvref_t<MeanDataType_>;
-    using InvStdDataType        = remove_cvref_t<InvStdDataType_>;
-    using BlockShape            = remove_cvref_t<BlockShape_>;
-    static constexpr bool kPadM = kPadM_;
-    static constexpr bool kPadN = kPadN_;
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
new file mode 100644
index 000000000..6661cddf4
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
+#include "ck_tile/ops/welford/block/block_welford.hpp"
+
+namespace ck_tile {
+
+struct Layernorm2dFwdPipelineDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<0, 1>, sequence<1, 2>>,
+                sequence<1, 1>,
+                sequence<0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford()
+    {
+        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+                                       typename Problem::ComputeDataType,
+                                       typename Problem::BlockShape>;
+
+        return BlockWelford<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync()
+    {
+        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+                                       typename Problem::ComputeDataType,
+                                       typename Problem::BlockShape>;
+
+        return BlockWelfordSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync()
+    {
+        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+                                       typename Problem::ComputeDataType,
+                                       typename Problem::BlockShape>;
+
+        return BlockWelfordCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockWelfordProblem<typename Problem::XDataType,
+                                           typename Problem::ComputeDataType,
+                                           typename Problem::BlockShape>;
+
+            using block_welford = BlockWelford<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
+                    MakeXBlockTileDistribution<Problem>()));
+            using mean_var_block_tile =
+                decltype(block_welford::template MakeMeanVarBlockTile<x_block_tile>());
+
+            return GetBlockWelfordCrossWarpSync<Problem>()
+                .template GetSmemSize<mean_var_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
new file mode 100644
index 000000000..d73bcb29e
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = Layernorm2dFwdPipelineDefaultPolicy>
+struct Layernorm2dFwdPipelineOnePass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using BetaDataType    = ck_tile::remove_cvref_t<typename Problem::BetaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+    using MeanDataType    = ck_tile::remove_cvref_t<typename Problem::MeanDataType>;
+    using InvStdDataType  = ck_tile::remove_cvref_t<typename Problem::InvStdDataType>;
+
+    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kHasBeta    = !std::is_same_v<BetaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveMean   = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr"; // block per row
+        else
+            return "wpr"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow,
+              typename GammaWindow,
+              typename BetaWindow,
+              typename YWindow,
+              typename MeanWindow,
+              typename InvStdWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const GammaWindow& gamma_window_,
+                                   const BetaWindow& beta_window_,
+                                   YWindow& y_window,
+                                   MeanWindow& mean_window,
+                                   InvStdWindow& inv_std_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        const auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        const auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+        const auto beta_window = make_tile_window(
+            beta_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+
+        const auto x  = load_tile(x_window);
+        int cur_count = 0;
+        int max_count =
+            block_tile_welford_calculate_max_count<typename Problem::BlockShape>(row_size);
+        auto block_welford      = Policy::template GetBlockWelford<Problem>();
+        auto block_welford_sync = Policy::template GetBlockWelfordSync<Problem>();
+        auto block_welford_cross_warp_sync =
+            Policy::template GetBlockWelfordCrossWarpSync<Problem>();
+
+        // load gamma/beta (TODO: support no gamma/beta?)
+        const auto gamma = load_tile(gamma_window);
+        const auto beta  = load_tile(beta_window);
+
+        // compute welford each-thread->cross-lane->cross-warp
+        auto [mean, var] = block_welford(x, cur_count, max_count);
+        block_welford_sync(mean, var, cur_count);
+        block_welford_cross_warp_sync(mean, var, cur_count, smem);
+        block_tile_welford_post_scale_var(var, cur_count);
+
+        // compute inv-std
+        auto inv_std = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_) + epsilon);
+            },
+            var);
+
+        if constexpr(kSaveMean)
+            store_tile(mean_window, cast_tile<MeanDataType>(mean));
+        if constexpr(kSaveInvStd)
+            store_tile(inv_std_window, cast_tile<InvStdDataType>(inv_std));
+
+        // layernorm computation
+        auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
+        sweep_tile(y, [&, mean_ = mean](auto idx) {
+            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+            constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+            const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+            const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
+
+            const auto x_ = type_convert<ComputeDataType>(x[idx]);
+            auto y_       = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+
+            y(idx) = type_convert<YDataType>(y_);
+        });
+        store_tile(y_window, y);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
new file mode 100644
index 000000000..8e9f8e81e
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType_,
+          typename GammaDataType_,
+          typename BetaDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename MeanDataType_,
+          typename InvStdDataType_,
+          typename BlockShape_,
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_>
+struct Layernorm2dFwdPipelineProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using GammaDataType   = remove_cvref_t<GammaDataType_>;
+    using BetaDataType    = remove_cvref_t<BetaDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using MeanDataType    = remove_cvref_t<MeanDataType_>;
+    using InvStdDataType  = remove_cvref_t<InvStdDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+
+    static constexpr bool kPadN           = kPadN_;
+    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kTwoPass        = kTwoPass_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
new file mode 100644
index 000000000..dcbfc87da
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = Layernorm2dFwdPipelineDefaultPolicy>
+struct Layernorm2dFwdPipelineTwoPass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using BetaDataType    = ck_tile::remove_cvref_t<typename Problem::BetaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+    using MeanDataType    = ck_tile::remove_cvref_t<typename Problem::MeanDataType>;
+    using InvStdDataType  = ck_tile::remove_cvref_t<typename Problem::InvStdDataType>;
+
+    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kHasBeta    = !std::is_same_v<BetaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveMean   = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr"; // block per row
+        else
+            return "wpr"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow,
+              typename GammaWindow,
+              typename BetaWindow,
+              typename YWindow,
+              typename MeanWindow,
+              typename InvStdWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const GammaWindow& gamma_window_,
+                                   const BetaWindow& beta_window_,
+                                   YWindow& y_window,
+                                   MeanWindow& mean_window,
+                                   InvStdWindow& inv_std_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+        auto beta_window = make_tile_window(
+            beta_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+
+        // Problem::BlockShape
+        static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+
+        // total number of count assume current iter have no pad(only last iter has pad)
+        constexpr index_t count_per_iter =
+            Problem::BlockShape::Repeat_N * Problem::BlockShape::Vector_N;
+        const index_t last_iter_n = row_size - (num_n_tile_iteration - 1) * Block_N;
+
+        int cur_count = 0;
+        int max_count =
+            (num_n_tile_iteration - 1) * count_per_iter +
+            block_tile_welford_calculate_max_count<typename Problem::BlockShape>(last_iter_n);
+        auto block_welford      = Policy::template GetBlockWelford<Problem>();
+        auto block_welford_sync = Policy::template GetBlockWelfordSync<Problem>();
+        auto block_welford_cross_warp_sync =
+            Policy::template GetBlockWelfordCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(load_tile(x_window));
+        auto mean         = block_welford.template MakeMeanVarBlockTile<XTensorType>();
+        auto var          = block_welford.template MakeMeanVarBlockTile<XTensorType>();
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_welford(x, mean, var, cur_count, max_count);
+            move_tile_window(x_window, {0, Block_N});
+        }
+
+        block_welford_sync(mean, var, cur_count);
+        block_welford_cross_warp_sync(mean, var, cur_count, smem);
+        block_tile_welford_post_scale_var(var, cur_count);
+
+        // compute inv-std
+        auto inv_std = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_) + epsilon);
+            },
+            var);
+
+        if constexpr(kSaveMean)
+            store_tile(mean_window, cast_tile<MeanDataType>(mean));
+        if constexpr(kSaveInvStd)
+            store_tile(inv_std_window, cast_tile<InvStdDataType>(inv_std));
+
+        // reverse read x to reuse cache
+        ck_tile::index_t stride_to_right_most_window =
+            row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
+
+        // x_window.foo();
+        // gamma_window.foo();
+        move_tile_window(x_window, {0, -Block_N});
+        move_tile_window(gamma_window, {stride_to_right_most_window});
+        move_tile_window(beta_window, {stride_to_right_most_window});
+        move_tile_window(y_window, {0, stride_to_right_most_window});
+
+        // layernorm computation
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            // load gamma/beta (TODO: support no gamma/beta?)
+            const auto gamma = load_tile(gamma_window);
+            const auto beta  = load_tile(beta_window);
+
+            auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
+
+            sweep_tile(y, [&, mean_ = mean](auto idx) {
+                constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+                const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+                const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
+
+                const auto x_ = type_convert<ComputeDataType>(x[idx]);
+                auto y_       = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+
+                y(idx) = type_convert<YDataType>(y_);
+            });
+
+            store_tile(y_window, y);
+
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(gamma_window, {-Block_N});
+            move_tile_window(beta_window, {-Block_N});
+            move_tile_window(y_window, {0, -Block_N});
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp
deleted file mode 100644
index 1ff541d84..000000000
--- a/include/ck_tile/ops/layernorm2d/pipeline/tile_layernorm2d_fwd_shape.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-template <typename ThreadTile, // Sequence<...
-          typename WarpTile,   // Sequence<...
-          typename BlockTile>  // Sequence<...
-struct TileLayernorm2dShape
-{
-    static constexpr index_t kMPerThread = ThreadTile::at(number<0>{});
-    static constexpr index_t kNPerThread = ThreadTile::at(number<1>{});
-
-    static constexpr index_t kMPerWarp = WarpTile::at(number<0>{});
-    static constexpr index_t kNPerWarp = WarpTile::at(number<1>{});
-
-    static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread;
-    static constexpr index_t kNThreadPerWarp = kNPerWarp / kNPerThread;
-
-    static constexpr index_t kMPerBlock = BlockTile::at(number<0>{});
-    static constexpr index_t kNPerBlock = BlockTile::at(number<1>{});
-
-    static constexpr index_t kMWarpPerBlock = kMPerBlock / kMPerWarp;
-    static constexpr index_t kNWarpPerBlock = kNPerBlock / kNPerWarp;
-
-    // TODO - kNNumWarps can only be 1 if we don't support cross warp welford
-    static_assert(kNWarpPerBlock == 1);
-
-    static constexpr index_t kBlockSize = warpSize * kMWarpPerBlock * kNWarpPerBlock;
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 682d60d87..63c364331 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/welford.hpp
index dffaad750..ebf940683 100644
--- a/include/ck_tile/ops/welford.hpp
+++ b/include/ck_tile/ops/welford.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "ck_tile/ops/welford/block/block_welford.hpp"
+#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
 #include "ck_tile/ops/welford/thread/thread_welford.hpp"
-#include "ck_tile/ops/welford/warp/warp_welford.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
new file mode 100644
index 000000000..55d55402d
--- /dev/null
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/welford/thread/thread_welford.hpp"
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockWelford
+{
+    using Problem         = remove_cvref_t<Problem_>;
+    using XDataType       = typename Problem::XDataType;
+    using ComputeDataType = typename Problem::ComputeDataType;
+
+    CK_TILE_DEVICE constexpr BlockWelford() {}
+
+    // [CAUSION] - max_count_ is to deal with the padding problem
+    // max_count_ is depend on caller, eg: naive and splitN welford will have different
+    // calculation of max_count_
+    // -> use block_welford_calculate_max_count to compute
+    template <typename XDistributedTensor_,
+              typename MeanDistributedTensor_,
+              typename VarDistributedTensor_>
+    CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
+                                   MeanDistributedTensor_& mean_tensor,
+                                   VarDistributedTensor_& var_tensor,
+                                   int& cur_count_, // -> prefer init as zero
+                                   const int& max_count_)
+    {
+        constexpr auto I0 = number<0>{};
+        constexpr auto I1 = number<1>{};
+
+        constexpr auto spans = XDistributedTensor_::get_distributed_spans();
+
+        sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
+            if(cur_count_ < max_count_)
+            {
+                ++cur_count_;
+
+                sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
+                    constexpr auto in_dstr_idx  = make_tuple(dstr_idx_i0, dstr_idx_i1);
+                    constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0);
+
+                    auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
+
+                    welford_update(
+                        mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_);
+                });
+            }
+        });
+    }
+
+    template <typename XDistributedTensor_>
+    CK_TILE_DEVICE static auto MakeMeanVarBlockTile()
+    {
+        static_assert(std::is_same_v<XDataType, typename XDistributedTensor_::DataType>, "wrong!");
+
+        constexpr auto reduce_dims = sequence<1>{};
+
+        constexpr auto dstr =
+            make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding(
+                XDistributedTensor_::get_tile_distribution()
+                    .get_static_tile_distribution_encoding(),
+                reduce_dims));
+
+        auto tensor = make_static_distributed_tensor<ComputeDataType>(dstr);
+
+        return tensor;
+    }
+
+    template <typename XDistributedTensor_>
+    CK_TILE_DEVICE auto
+    operator()(const XDistributedTensor_& x_tensor, int& cur_count_, const int& max_count_)
+    {
+        auto mean_tensor = MakeMeanVarBlockTile<XDistributedTensor_>();
+        auto var_tensor  = MakeMeanVarBlockTile<XDistributedTensor_>();
+        clear_tile(mean_tensor);
+        clear_tile(var_tensor);
+
+        (*this)(x_tensor, mean_tensor, var_tensor, cur_count_, max_count_);
+
+        return ck_tile::make_tuple(mean_tensor, var_tensor);
+    }
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockWelfordSync
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
+    CK_TILE_DEVICE void
+    operator()(MeanDistributedTensor_& mean_tensor, VarDistributedTensor_& var_tensor, int& count)
+    {
+        using Dstr             = typename MeanDistributedTensor_::StaticTileDistribution;
+        using DstrEncode       = typename Dstr::DstrEncode;
+        using DstrEncodeDetail = typename DstrEncode::detail;
+
+        static_assert(std::is_same_v<Dstr, typename VarDistributedTensor_::StaticTileDistribution>,
+                      "wrong!");
+
+        constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
+        constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
+
+        constexpr index_t idim_p_lane = NDimP - 1;
+
+        // const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
+        // const auto rs_idx =
+        //     mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
+
+        constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
+        static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
+
+        const int original_count = count;
+
+        // loop over thread data
+        static_for<0, thread_buf_size, 1>{}([&](auto i) {
+            auto v_local_mean  = mean_tensor.get_thread_buffer()[i];
+            auto v_local_var   = var_tensor.get_thread_buffer()[i];
+            auto v_local_count = original_count;
+
+            // cross-lane reduce for replication
+            // only reduce on R dimension correspond to lane
+            // (lane id maps to this R dimension)
+            static_for<0, NDimR, 1>{}([&](auto idim_r) {
+                // FIXME: nasty to use does_p_own_r_
+                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
+                {
+                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
+
+                    constexpr index_t lid_over_rid_derivative =
+                        DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
+
+                    static_assert(is_power_of_two_integer(r_length),
+                                  "wrong! only support power of 2 reduction");
+
+                    constexpr index_t nstage = integer_log2_floor(r_length);
+
+                    // reduction sweep forward
+                    static_for<0, nstage, 1>{}([&](auto istage) {
+                        // xor
+                        index_t src_lane =
+                            (__lane_id()) ^
+                            (number<lid_over_rid_derivative << istage.value>{}.value);
+
+                        // pull data from remote lane
+                        const auto v_remote_mean  = warp_shuffle(v_local_mean, src_lane);
+                        const auto v_remote_var   = warp_shuffle(v_local_var, src_lane);
+                        const auto v_remote_count = warp_shuffle(v_local_count, src_lane);
+
+                        // welford merge
+                        welford_merge(v_local_mean,
+                                      v_local_var,
+                                      v_local_count,
+                                      v_remote_mean,
+                                      v_remote_var,
+                                      v_remote_count);
+                    });
+                }
+            });
+
+            mean_tensor.get_thread_buffer()(i) = v_local_mean;
+            var_tensor.get_thread_buffer()(i)  = v_local_var;
+
+            count = v_local_count;
+        });
+    }
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockWelfordCrossWarpSync
+{
+    using Problem    = remove_cvref_t<Problem_>;
+    using BlockShape = typename Problem::BlockShape;
+
+    template <typename MeanDistributedTensor_>
+    CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
+    {
+        constexpr index_t num_reduce_warps = [&]() {
+            using Dstr             = typename MeanDistributedTensor_::StaticTileDistribution;
+            using DstrEncode       = typename Dstr::DstrEncode;
+            using DstrEncodeDetail = typename DstrEncode::detail;
+
+            constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
+
+            constexpr index_t idim_p_warp = 0;
+
+            index_t len_ = 1;
+            static_for<0, NDimR, 1>{}([&](auto idim_r) {
+                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r])
+                {
+                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
+                    len_ *= r_length;
+                }
+            });
+            return len_;
+        }();
+        return num_reduce_warps;
+    }
+
+    // return in byte
+    template <typename MeanDistributedTensor_>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        // constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
+
+        // data need to exchange is very small, we just pack mean+var+count -> 4dword
+        constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
+
+        // we need to store all data from every wave into smem
+        // e.g. 2x2 reduce along N
+        //     -------------> reduce N
+        //    | w0 | w1 |   ___>      | w01 |
+        //    | w2 | w3 |             | w23 |
+        //
+        //   -> store data from every wave into LDS
+        //
+        //
+        //     -------------> reduce N
+        //    | w0 | w1 | w2 | w3 |   ----->  | w0123 |
+        //
+        //   -> also store data from every wave into LDS
+        constexpr index_t num_warps = BlockShape::BlockSize / warpSize;
+        return num_warps * 4 * thread_buf_size * sizeof(float);
+    }
+
+    template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
+    CK_TILE_DEVICE void operator()(MeanDistributedTensor_& mean_tensor,
+                                   VarDistributedTensor_& var_tensor,
+                                   int& count,
+                                   void* smem)
+    {
+        using DataType = typename MeanDistributedTensor_::DataType;
+        using Dstr     = typename MeanDistributedTensor_::StaticTileDistribution;
+        // using DstrEncode       = typename Dstr::DstrEncode;
+        // using DstrEncodeDetail = typename DstrEncode::detail;
+
+        static_assert(std::is_same_v<Dstr, typename VarDistributedTensor_::StaticTileDistribution>,
+                      "wrong!");
+
+        constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
+        static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
+
+        // Note: we always pack everything into fp32x4
+        fp32x4_t* smem_ptr              = reinterpret_cast<fp32x4_t*>(smem);
+        const index_t lane_id           = get_lane_id();
+        const index_t warp_id           = get_warp_id();
+        constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
+        constexpr index_t num_warps     = BlockShape::BlockSize / warpSize;
+        const index_t smem_offset       = warp_id;
+
+        // skip if nonthing to do
+        if constexpr(num_reduce_warps == 1)
+            return;
+
+        // store into smem only for lane-0 within one warp
+        if(lane_id == 0)
+        {
+            static_for<0, thread_buf_size, 1>{}([&](auto i) {
+                fp32x4_t local_scratch_;
+                local_scratch_[0] = bit_cast<float>(mean_tensor.get_thread_buffer()[i]);
+                local_scratch_[1] = bit_cast<float>(var_tensor.get_thread_buffer()[i]);
+                local_scratch_[2] = bit_cast<float>(count);
+
+                smem_ptr[smem_offset + i * num_warps] = local_scratch_;
+            });
+        }
+        block_sync_lds();
+
+        // load from smem. here we let everythread to do compute :)
+        index_t local_warp_id = warp_id / num_reduce_warps;
+        index_t local_smem_os = local_warp_id * num_reduce_warps;
+        fp32x4_t all_scratch[thread_buf_size * num_reduce_warps];
+        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
+            static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
+                all_scratch[i_0 * num_warps + i_1] =
+                    smem_ptr[i_0 * num_reduce_warps + local_smem_os + i_1];
+            });
+        });
+        block_sync_lds(); // TODO: we don't need sync here
+
+        // const int original_count = count;
+
+        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
+            // TODO: use descriptor for this
+            auto v_local       = all_scratch[i_0 * num_warps];
+            auto v_local_mean  = bit_cast<DataType>(v_local[0]);
+            auto v_local_var   = bit_cast<DataType>(v_local[1]);
+            auto v_local_count = bit_cast<int>(v_local[2]);
+
+            // further reduce mean/var
+            static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
+                constexpr auto i_1        = number<i_1_n1 + 1>{};
+                const fp32x4_t v_remote   = all_scratch[i_0 * num_warps + i_1];
+                const auto v_remote_mean  = bit_cast<DataType>(v_remote[0]);
+                const auto v_remote_var   = bit_cast<DataType>(v_remote[1]);
+                const auto v_remote_count = bit_cast<int>(v_remote[2]);
+
+                welford_merge(v_local_mean,
+                              v_local_var,
+                              v_local_count,
+                              v_remote_mean,
+                              v_remote_var,
+                              v_remote_count);
+            });
+
+            mean_tensor.get_thread_buffer()(i_0) = v_local_mean;
+            var_tensor.get_thread_buffer()(i_0)  = v_local_var;
+
+            count = v_local_count;
+        });
+    }
+};
+
+// compute the max count for a last dim reduce
+// everything may have vector/repeat, so the max count could be uneven
+// TODO: specify which dim to compute and proper set the problem
+// TODO: BlockShape we reuse layernorm_fwd_shape :)
+template <typename BlockShape>
+CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size)
+{
+#if 0
+    using S                   = BlockShape;
+    index_t LastloopN         = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N;
+    constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N;
+    index_t iNLane            = get_thread_id() % NThread;
+    index_t iN0               = LastloopN / (S::Vector_N * S::ThreadPerWarp_N);
+    index_t iN1               = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N;
+    index_t N2                = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N;
+    index_t iN3               = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0;
+    return iN0 * S::Vector_N + iN3;
+#endif
+    using S_                            = BlockShape;
+    constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N;
+
+    // TODO: we always check vector size, need be evenly devidable by vector-n
+    const index_t element_per_row = row_size / S_::Vector_N;
+    index_t lane_id_n             = get_thread_id() % ThreadsPerBlock_N;
+
+    index_t cnt = 0;
+    // TODO: Repeat_N can not be too long, otherwise this is not good
+    static_for<0, S_::Repeat_N, 1>{}([&](auto) {
+        index_t _a = lane_id_n < element_per_row ? 1 : 0;
+        cnt += _a;
+        lane_id_n += ThreadsPerBlock_N;
+    });
+    return cnt * S_::Vector_N;
+}
+
+// Note: this function must be called after all the computation
+template <typename VarDistributedTensor_>
+CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor,
+                                                                int count)
+{
+    using DataType = typename VarDistributedTensor_::DataType;
+    tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
+                           var_tensor);
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/welford/block/block_welford_problem.hpp
new file mode 100644
index 000000000..dcae1ef2e
--- /dev/null
+++ b/include/ck_tile/ops/welford/block/block_welford_problem.hpp
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
+struct BlockWelfordProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp
index 2ca9a2365..4c61cdcf4 100644
--- a/include/ck_tile/ops/welford/thread/thread_welford.hpp
+++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp
@@ -7,95 +7,30 @@
 
 namespace ck_tile {
 
-template <typename ComputeDataType_, typename XDataType_>
-struct ThreadWelford
+template <typename T>
+CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
 {
-    using XDataType       = remove_cvref_t<XDataType_>;
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-
-    template <typename T>
-    CK_TILE_DEVICE void Update(T& mean, T& var, T x)
-    {
-        if(ck_tile::isnan(x))
-        {
-            mean = x;
-            var  = x;
-        }
-        else
-        {
-            T delta = x - mean;
-            mean += delta / cur_count_;
-            T delta2 = x - mean;
-            var += delta * delta2;
-        }
-    }
-
-    // [CAUSION] - max_count_ is to deal with the padding problem
-    // max_count_ is depend on caller, eg: naive and splitN welford will have different
-    // calculation of max_count_
-    CK_TILE_DEVICE constexpr ThreadWelford(int max_count) : cur_count_(0), max_count_(max_count) {}
-
-    template <typename XDistributedTensor_,
-              typename MeanDistributedTensor_,
-              typename VarDistributedTensor_>
-    CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
-                                   MeanDistributedTensor_& mean_tensor,
-                                   VarDistributedTensor_& var_tensor)
-    {
-        constexpr auto I0 = number<0>{};
-        constexpr auto I1 = number<1>{};
-
-        constexpr auto spans = XDistributedTensor_::get_distributed_spans();
-
-        sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
-            if(cur_count_ < max_count_)
-            {
-                ++cur_count_;
-
-                sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
-                    constexpr auto in_dstr_idx  = make_tuple(dstr_idx_i0, dstr_idx_i1);
-                    constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0);
-
-                    auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
-
-                    Update(mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x);
-                });
-            }
-        });
-    }
-
-    template <typename XDistributedTensor_>
-    CK_TILE_DEVICE static auto MakeInitialMeanVarDistributedTensor()
-    {
-        static_assert(std::is_same_v<XDataType, typename XDistributedTensor_::DataType>, "wrong!");
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        constexpr auto dstr =
-            make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding(
-                XDistributedTensor_::get_tile_distribution()
-                    .get_static_tile_distribution_encoding(),
-                reduce_dims));
-
-        auto tensor = make_static_distributed_tensor<ComputeDataType>(dstr);
-        clear_tile(tensor);
-
-        return tensor;
-    }
-
-    template <typename XDistributedTensor_>
-    CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor)
-    {
-        auto mean_tensor = MakeInitialMeanVarDistributedTensor<XDistributedTensor_>();
-        auto var_tensor  = MakeInitialMeanVarDistributedTensor<XDistributedTensor_>();
-
-        (*this)(x_tensor, mean_tensor, var_tensor);
-
-        return ck_tile::make_tuple(mean_tensor, var_tensor);
-    }
-
-    int cur_count_;
-    int max_count_;
-};
+    // TODO: check nan? maybe no
+    T delta = x - mean;
+    mean += delta / count;
+    T delta2 = x - mean;
+    var += delta * delta2;
+}
+
+template <typename T>
+CK_TILE_DEVICE static void
+welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
+{
+    int count            = count_a + count_b;
+    T count_             = type_convert<T>(count);
+    T count_a_           = type_convert<T>(count_a);
+    T count_b_           = type_convert<T>(count_b);
+    T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
+
+    T delta = mean_b - mean_a;
+    mean_a += delta * count_b_over_count;
+    var_a += var_b + delta * delta * count_a_ * count_b_over_count;
+    count_a = count;
+}
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/warp/warp_welford.hpp b/include/ck_tile/ops/welford/warp/warp_welford.hpp
deleted file mode 100644
index 687b61f43..000000000
--- a/include/ck_tile/ops/welford/warp/warp_welford.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-
-template <typename ComputeDataType_, bool BroadcastLane = true, bool GetActualVariance = true>
-struct WarpMergeWelford
-{
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-
-    template <typename T>
-    CK_TILE_DEVICE static void
-    Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
-    {
-        int count            = count_a + count_b;
-        T count_             = type_convert<T>(count);
-        T count_a_           = type_convert<T>(count_a);
-        T count_b_           = type_convert<T>(count_b);
-        T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
-
-        T delta = mean_b - mean_a;
-        mean_a += delta * count_b_over_count;
-        var_a += var_b + delta * delta * count_a_ * count_b_over_count;
-        count_a = count;
-    }
-
-    template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
-    CK_TILE_DEVICE void
-    operator()(MeanDistributedTensor_& mean_tensor, VarDistributedTensor_& var_tensor, int& count)
-    {
-        using Dstr             = typename MeanDistributedTensor_::StaticTileDistribution;
-        using DstrEncode       = typename Dstr::DstrEncode;
-        using DstrEncodeDetail = typename DstrEncode::detail;
-
-        static_assert(std::is_same_v<Dstr, typename VarDistributedTensor_::StaticTileDistribution>,
-                      "wrong!");
-
-        constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
-        constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
-
-        constexpr index_t idim_p_lane = NDimP - 1;
-
-        const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
-        const auto rs_idx =
-            mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
-
-        constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
-        static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
-
-        const int original_count = count;
-
-        // loop over thread data
-        static_for<0, thread_buf_size, 1>{}([&](auto i) {
-            auto v_local_mean  = mean_tensor.get_thread_buffer()[i];
-            auto v_local_var   = var_tensor.get_thread_buffer()[i];
-            auto v_local_count = original_count;
-
-            // cross-lane reduce for replication
-            // only reduce on R dimension correspond to lane
-            // (lane id maps to this R dimension)
-            static_for<0, NDimR, 1>{}([&](auto idim_r) {
-                // FIXME: nasty to use does_p_own_r_
-                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
-                {
-                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
-
-                    constexpr index_t lid_over_rid_derivative =
-                        DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
-
-                    static_assert(is_power_of_two_integer(r_length),
-                                  "wrong! only support power of 2 reduction");
-
-                    constexpr index_t nstage = integer_log2_floor(r_length);
-
-                    // reduction sweep forward
-                    static_for<0, nstage, 1>{}([&](auto istage) {
-                        constexpr index_t lid_delta =
-                            lid_over_rid_derivative * (1 << (nstage - istage - 1));
-
-                        // pull data from remote lane
-                        const auto v_remote_mean  = warp_shuffle_down(v_local_mean, lid_delta);
-                        const auto v_remote_var   = warp_shuffle_down(v_local_var, lid_delta);
-                        const auto v_remote_count = warp_shuffle_down(v_local_count, lid_delta);
-
-                        // welford merge
-                        Merge(v_local_mean,
-                              v_local_var,
-                              v_local_count,
-                              v_remote_mean,
-                              v_remote_var,
-                              v_remote_count);
-                    });
-                }
-            });
-
-            // cross-lane broadcast for replication
-            // only broadcast on R dimension correspond to lane
-            // (lane id maps to this R dimension)
-            if constexpr(BroadcastLane)
-            {
-                static_for<0, NDimR, 1>{}([&](auto idim_r) {
-                    // FIXME: nasty to use does_p_own_r_
-                    if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
-                    {
-                        const index_t r_id = rs_idx[idim_r];
-
-                        constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
-
-                        constexpr index_t lid_over_rid_derivative =
-                            DstrEncodeDetail::ps_over_rs_derivative_[NDimP - 1][idim_r];
-
-                        static_assert(is_power_of_two_integer(r_length),
-                                      "wrong! only support power of 2 reduction");
-
-                        constexpr index_t nstage = integer_log2_floor(r_length);
-
-                        // broadcast sweep backward
-                        static_for<0, nstage, 1>{}([&](auto istage) {
-                            // do I hold reduced data?
-                            const bool do_i_hold_reduced_data = r_id < (1 << istage);
-
-                            constexpr index_t lid_delta = lid_over_rid_derivative * (1 << istage);
-
-                            // pull data from remote lane
-                            const auto v_remote_mean  = warp_shuffle_up(v_local_mean, lid_delta);
-                            const auto v_remote_var   = warp_shuffle_up(v_local_var, lid_delta);
-                            const auto v_remote_count = warp_shuffle_up(v_local_count, lid_delta);
-
-                            // decide whether to update local data with remote data
-                            v_local_mean  = do_i_hold_reduced_data ? v_local_mean : v_remote_mean;
-                            v_local_var   = do_i_hold_reduced_data ? v_local_var : v_remote_var;
-                            v_local_count = do_i_hold_reduced_data ? v_local_count : v_remote_count;
-                        });
-                    }
-                });
-            }
-
-            mean_tensor.get_thread_buffer()(i) = v_local_mean;
-
-            if constexpr(GetActualVariance)
-                var_tensor.get_thread_buffer()(i) = v_local_var / v_local_count;
-            else
-                var_tensor.get_thread_buffer()(i) = v_local_var;
-
-            count = v_local_count;
-        });
-    }
-};
-
-} // namespace ck_tile
-- 
GitLab


From 82fc53835aabb044d2ef15f485d0a2c8d52b4702 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Tue, 22 Oct 2024 16:18:28 +0200
Subject: [PATCH 009/153] Enable grouped conv bwd wei bf16 NGCHW (#1589)

* Enable grouped conv bwd wei bf16 NGCHW

* fixes

* fixes

* Fixes

* fixes

* fixes

* Fixes
---
 ...conv_bwd_weight_two_stage_xdl_instance.hpp | 71 +++++++++++++-
 ...e_grouped_conv_bwd_weight_xdl_instance.hpp |  4 +-
 .../grouped_convolution_backward_weight.hpp   | 48 ++++++++++
 ...rouped_convolution_backward_weight_xdl.inc | 94 +++++++++++++++++++
 .../grouped_conv1d_bwd_weight/CMakeLists.txt  |  6 +-
 ...gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} |  2 +-
 ...nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp} |  2 +-
 ...gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} | 28 +++---
 .../grouped_conv2d_bwd_weight/CMakeLists.txt  | 12 ++-
 ...wc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} |  2 +-
 ...gc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} |  2 +-
 ...ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp | 41 ++++++++
 ...ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp | 41 ++++++++
 ...nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp | 41 ++++++++
 ...nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp | 41 ++++++++
 ...wc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} | 28 +++---
 ...gc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} | 28 +++---
 .../grouped_conv3d_bwd_weight/CMakeLists.txt  | 12 ++-
 ..._gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} |  2 +-
 ..._gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} |  2 +-
 ...wgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp | 41 ++++++++
 ...wgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp | 41 ++++++++
 ...dhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp | 41 ++++++++
 ...dhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp | 41 ++++++++
 ..._gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} | 28 +++---
 ..._gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} | 28 +++---
 .../src/profile_grouped_conv_bwd_weight.cpp   | 26 ++++-
 script/convert_miopen_driver_to_profiler.py   |  5 +-
 28 files changed, 667 insertions(+), 91 deletions(-)
 rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/{device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} (96%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/{device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp} (96%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/{device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp => device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp} (60%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/{device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} (97%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/{device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} (97%)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
 rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp} (62%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp => device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp} (61%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/{device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} (96%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/{device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} (97%)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp} (60%)
 rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp => device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp} (61%)

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
index 2ce334d9d..5f6c340e4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
@@ -15,8 +15,9 @@ namespace instance {
 
 using namespace ck::tensor_layout::convolution;
 
-using F16 = ck::half_t;
-using F32 = float;
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
 
 using Empty_Tuple = ck::Tuple<>;
 
@@ -45,17 +46,42 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 8>,
 
-        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              2,              2,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 8>
     // clang-format on
     >;
 
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = std::tuple<
+    // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 8>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              2,              2,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 8>
+    // clang-format on
+    >;
+
 // NGCHW requires transpose, we use vector loads and stores params for them
 template <ck::index_t NDimSpatial,
           typename ALayout,
@@ -96,6 +122,45 @@ using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances
     // clang-format on
     >;
 
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances = std::tuple<
+    // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>,
+    
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 8>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              2,              2,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 8>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 1, 2>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 1, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 1, 8>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 1, 4>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 1, 8>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 8 ,1>,
+
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,                  1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
index 096e0b177..32f52770b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -113,7 +113,7 @@ template <ck::index_t NDimSpatial,
           typename BLayout,
           typename ELayout,
           ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std::tuple<
     // clang-format off
         //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|               ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
         //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                     Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index 0f11d337f..797233be0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -367,6 +367,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
                         op_ptrs);
                 }
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
+                        op_ptrs);
+                }
 #endif
             }
             if constexpr(is_same_v<InLayout, NGCHW> && is_same_v<WeiLayout, GKYXC> &&
@@ -382,6 +393,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
                         op_ptrs);
                 }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
+                        op_ptrs);
+                }
 #endif
             }
         }
@@ -453,6 +477,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
                         op_ptrs);
                 }
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
+                        op_ptrs);
+                }
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
                 if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
@@ -477,6 +512,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
                         op_ptrs);
                 }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
+                        op_ptrs);
+                }
 #endif
             }
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
index f240fa323..5f6f2fc6f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -100,6 +100,53 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
@@ -226,6 +273,53 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt
index ab4313d89..b057e0c8d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt
@@ -2,16 +2,16 @@
 set(GROUPED_CONV1D_BWD_WEIGHT
     xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
     xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
-    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp)
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp)
 
 if(DL_KERNELS)
     list(APPEND GROUPED_CONV1D_BWD_WEIGHT
         dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp
         dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp
-        dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp
+        dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
         dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp
         dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp
-        dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp)
+        dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp)
 endif()
 
 add_instance_library(device_grouped_conv1d_bwd_weight_instance ${GROUPED_CONV1D_BWD_WEIGHT})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
index d7a15784a..59981b642 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp
index a92cb4285..a2ac640d3 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
similarity index 60%
rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
index f9368ab57..9c97d80c8 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -24,19 +24,21 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<1,
-                                                                    GNWC,
-                                                                    GKXC,
-                                                                    GNWK,
-                                                                    ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            1,
+            GNWC,
+            GKXC,
+            GNWK,
+            ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
-                                       1,
-                                       GNWC,
-                                       GKXC,
-                                       GNWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            1,
+            GNWC,
+            GKXC,
+            GNWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
index 8d67b46fb..ef99d69ae 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -2,24 +2,28 @@
 set(GROUPED_CONV2D_BWD_WEIGHT
     xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
     )
 
 if(DL_KERNELS)
     list(APPEND GROUPED_CONV2D_BWD_WEIGHT
         dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
         dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-        dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+        dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
         dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
         dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-        dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp)
+        dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp)
 endif()
 
 add_instance_library(device_grouped_conv2d_bwd_weight_instance ${GROUPED_CONV2D_BWD_WEIGHT})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
index 37b465e6c..63d20524f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
index cf3db8331..a615edfac 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
new file mode 100644
index 000000000..9fbdc6c46
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
+            2,
+            NGCHW,
+            GKYXC,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
new file mode 100644
index 000000000..e1c865a88
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
+            2,
+            NGCHW,
+            GKYXC,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
new file mode 100644
index 000000000..0e4d085de
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
new file mode 100644
index 000000000..680494cfd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
similarity index 62%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
index 17f5ee4e2..69e22dee4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,19 +25,21 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
-                                                                    GNHWC,
-                                                                    GKYXC,
-                                                                    GNHWK,
-                                                                    ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            2,
+            GNHWC,
+            GKYXC,
+            GNHWK,
+            ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
-                                       2,
-                                       GNHWC,
-                                       GKYXC,
-                                       GNHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            2,
+            GNHWC,
+            GKYXC,
+            GNHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
similarity index 61%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
index 614cc0a7e..cac935335 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,19 +25,21 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
-                                                                    NHWGC,
-                                                                    GKYXC,
-                                                                    NHWGK,
-                                                                    ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
-                                       2,
-                                       NHWGC,
-                                       GKYXC,
-                                       NHWGK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index 7857bb029..2ceac45f9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -2,24 +2,28 @@
 set(GROUPED_CONV3D_BWD_WEIGHT
      xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
     )
 
 if(DL_KERNELS)
     list(APPEND GROUPED_CONV3D_BWD_WEIGHT
         dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
         dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
-        dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+        dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
         dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
         dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-        dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
+        dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp)
 endif()
 
 list(APPEND GROUPED_CONV3D_BWD_WEIGHT
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
index c9646d085..eadb7afd6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
index a37e6cbf3..b39babf3e 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
new file mode 100644
index 000000000..549716586
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
new file mode 100644
index 000000000..18a00c6ea
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
new file mode 100644
index 000000000..ac6cb8268
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
+            3,
+            NGCDHW,
+            GKZYXC,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
new file mode 100644
index 000000000..705f5e8ce
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
+            3,
+            NGCDHW,
+            GKZYXC,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
similarity index 60%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
index 91d80e4f7..81d64344f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -24,19 +24,21 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
-                                                                    GNDHWC,
-                                                                    GKZYXC,
-                                                                    GNDHWK,
-                                                                    ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            3,
+            GNDHWC,
+            GKZYXC,
+            GNDHWK,
+            ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
-                                       3,
-                                       GNDHWC,
-                                       GKZYXC,
-                                       GNDHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            3,
+            GNDHWC,
+            GKZYXC,
+            GNDHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
similarity index 61%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
index a394e0d6f..679f30a3d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,19 +25,21 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
-                                                                    NDHWGC,
-                                                                    GKZYXC,
-                                                                    NDHWGK,
-                                                                    ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
-                                       3,
-                                       NDHWGC,
-                                       GKZYXC,
-                                       NDHWGK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index 8533f3e8f..9872ff8ac 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -25,7 +25,8 @@ enum struct ConvDataType
     F16_F16_F16,        // 1
     BF16_F32_BF16,      // 2
     F16_F16_F16_BF8_F8, // 3
-    I8_I8_I8            // 4
+    I8_I8_I8,           // 4
+    BF16_BF16_BF16,     // 5
 };
 
 #define OP_NAME "grouped_conv_bwd_weight"
@@ -38,7 +39,8 @@ static void print_helper_msg()
               << "                 1: Input fp16, Weight fp16, Output fp16\n"
               << "                 2: Input bf16, Weight fp32, Output bf16\n"
               << "                 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n"
-              << "                 4: Input int8, Weight int8, Output int8)\n"
+              << "                 4: Input int8, Weight int8, Output int8\n"
+              << "                 5: Input bf16, Weight bf16, Output bf16)\n"
               << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
                  "N, K, Ho, Wo]\n"
               << "                     1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
@@ -187,6 +189,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
         {
             return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            // fp32 atomic add is used for weight tensor in bf16 kernel
+            return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
     }
     if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
     {
@@ -203,6 +210,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
             // fp32 atomic add is used for weight tensor in bf16 kernel
             return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(
+                I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
         else if(data_type == ConvDataType::I8_I8_I8)
         {
             return profile(
@@ -224,6 +236,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
             // fp32 atomic add is used for weight tensor in bf16 kernel
             return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(
+                I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
         if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
         {
             return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
@@ -240,6 +257,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
         {
             return profile(I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(
+                I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
     }
 
     std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py
index d9f5050d0..5bcaf1448 100644
--- a/script/convert_miopen_driver_to_profiler.py
+++ b/script/convert_miopen_driver_to_profiler.py
@@ -65,8 +65,9 @@ def parse_data_type(args):
         if args.ck_profier_op == "grouped_conv_fwd":
             args.data_type = 3
     if args.data_type == "bfp16":
-        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
-           args.ck_profier_op == "grouped_conv_bwd_data" or \
+        if args.ck_profier_op == "grouped_conv_bwd_weight":
+            args.data_type = 5
+        if args.ck_profier_op == "grouped_conv_bwd_data" or \
            args.ck_profier_op == "grouped_conv_fwd":
             args.data_type = 2
 
-- 
GitLab


From 4d5248e2d17770234f433f1a83aa0294ff60c7b1 Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com>
Date: Tue, 22 Oct 2024 19:17:32 +0100
Subject: [PATCH 010/153] Explicit cast values to half (#1593)

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 include/ck/utility/math_v2.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index cbbe15585..b374c4ad5 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -653,7 +653,7 @@ inline __device__ double sin<double>(double x)
 template <>
 inline __device__ half_t sin<half_t>(half_t x)
 {
-    return ::hsin(x);
+    return hsin(static_cast<__half>(x));
 };
 
 template <typename T>
@@ -785,7 +785,7 @@ inline __device__ double ceil<double>(double x)
 template <>
 inline __device__ half_t ceil<half_t>(half_t x)
 {
-    return ::hceil(x);
+    return hceil(static_cast<__half>(x));
 };
 
 template <typename T>
@@ -827,7 +827,7 @@ inline __device__ double floor<double>(double x)
 template <>
 inline __device__ half_t floor<half_t>(half_t x)
 {
-    return ::hfloor(x);
+    return hfloor(static_cast<__half>(x));
 };
 
 template <typename T>
@@ -849,7 +849,7 @@ inline __device__ T exp(T x)
 template <>
 inline __device__ half_t exp<half_t>(half_t x)
 {
-    return hexp(x);
+    return hexp(static_cast<__half>(x));
 };
 
 template <>
@@ -873,7 +873,7 @@ inline __device__ T log(T x)
 template <>
 inline __device__ half_t log<half_t>(half_t x)
 {
-    return hlog(x);
+    return hlog(static_cast<__half>(x));
 };
 
 template <>
-- 
GitLab


From cedccd59c94cb0c74e7ec0d0f6c791aed081febc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 23 Oct 2024 12:02:33 +0200
Subject: [PATCH 011/153] [POST MERGE PR] Enable grouped conv bwd wei bf16
 NGCHW (#1594)

---
 ...e_grouped_conv_bwd_weight_xdl_instance.hpp | 35 ++++++++++++++
 .../grouped_convolution_backward_weight.hpp   |  4 ++
 ...rouped_convolution_backward_weight_xdl.inc | 24 ++++++++++
 .../grouped_conv2d_bwd_weight/CMakeLists.txt  |  1 +
 ...ht_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 46 +++++++++++++++++++
 .../grouped_conv3d_bwd_weight/CMakeLists.txt  |  1 +
 ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 46 +++++++++++++++++++
 .../src/profile_grouped_conv_bwd_weight.cpp   |  9 ++--
 8 files changed, 161 insertions(+), 5 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
index 32f52770b..a08d73546 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -141,6 +141,41 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std
     // clang-format on
     >;
 
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        // instance for small conv.K
+        // for bf16 conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
 template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index 797233be0..db17f0f38 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -373,6 +373,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<ComputeTypeA, ck::bhalf_t> &&
                              is_same_v<ComputeTypeB, ck::bhalf_t>)
                 {
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+                        op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
@@ -483,6 +485,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<ComputeTypeA, ck::bhalf_t> &&
                              is_same_v<ComputeTypeB, ck::bhalf_t>)
                 {
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                        op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
index 5f6f2fc6f..132dde81a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -89,6 +89,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                            PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -262,6 +274,18 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
                                                            PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
index ef99d69ae..546a62a8a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -6,6 +6,7 @@ set(GROUPED_CONV2D_BWD_WEIGHT
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..ee71e37e7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    NHWGK,
+                                                                    ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index 2ceac45f9..c8c30897c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -6,6 +6,7 @@ set(GROUPED_CONV3D_BWD_WEIGHT
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..f1ea37181
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index 9872ff8ac..4170ac65a 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -182,6 +182,10 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
             // fp32 atomic add is used for weight tensor in bf16 kernel
             return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
     }
     else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW)
     {
@@ -210,11 +214,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
             // fp32 atomic add is used for weight tensor in bf16 kernel
             return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
-        if(data_type == ConvDataType::BF16_BF16_BF16)
-        {
-            return profile(
-                I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
-        }
         else if(data_type == ConvDataType::I8_I8_I8)
         {
             return profile(
-- 
GitLab


From 8e22e1ae31bbf7086f69d8724e027676791d351a Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 23 Oct 2024 15:55:39 -0700
Subject: [PATCH 012/153] fix the logic of enabling XDL and WMMA instances
 (#1595)

---
 CMakeLists.txt | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0700fe838..6a5180363 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,18 +177,14 @@ rocm_check_target_ids(SUPPORTED_GPU_TARGETS
 
 message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
 
-if (GPU_TARGETS)
-    if (GPU_TARGETS MATCHES "gfx9")
-        add_definitions(-DCK_USE_XDL)
-        set(CK_USE_XDL "ON")
-    endif()
-    if (GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
-        add_definitions(-DCK_USE_WMMA)
-        set(CK_USE_WMMA "ON")
-    endif()
-else()
-    add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+    message("Enabling XDL instances")
+    add_definitions(-DCK_USE_XDL)
     set(CK_USE_XDL "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    message("Enabling WMMA instances")
+    add_definitions(-DCK_USE_WMMA)
     set(CK_USE_WMMA "ON")
 endif()
 
@@ -578,7 +574,7 @@ rocm_package_setup_component(profiler
 )
 add_subdirectory(profiler)
 
-if(CK_USE_CODEGEN AND (GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
+if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
   add_subdirectory(codegen)
 endif()
 
-- 
GitLab


From 9183ce69cac01374d0eafbdb4258cf1744b5a548 Mon Sep 17 00:00:00 2001
From: dummycoderfe <felixamd@163.com>
Date: Fri, 25 Oct 2024 11:17:45 +0800
Subject: [PATCH 013/153] hot_fix epsilon pos (#1597)

Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
---
 .../layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp  | 2 +-
 .../layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index d73bcb29e..bf002141b 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -90,7 +90,7 @@ struct Layernorm2dFwdPipelineOnePass
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
-                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_) + epsilon);
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
             },
             var);
 
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index dcbfc87da..db094ac2a 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -105,7 +105,7 @@ struct Layernorm2dFwdPipelineTwoPass
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
-                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_) + epsilon);
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
             },
             var);
 
-- 
GitLab


From 9385caa3069b8b366c365765164df0c0b6b32925 Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Fri, 25 Oct 2024 12:46:24 +0200
Subject: [PATCH 014/153] Generic threshold calculation (#1546)

* Calculate generic relative threshold pool3dfwd

* Calculate absolute error threshold pool3d fwd

* Generic threshold calculation take max input for relative error pool3dfwd

* Remove max possible value for error calculation at runtime

* Remove debug print in pool3dfwd

* Pool3d fwd adjusted types in generic threshold calculation

* Generic threshold calculation take into account number of accumulations and accdatatype

* Generic threshold fix final error formula

* Generic threshold calculation - num of accs fix

* Generic threshold calculation - adjust absolute error

* Generic threshold calculation - OutDataType in absolute error
---
 include/ck/utility/data_type.hpp              |   9 ++
 .../include/ck/library/utility/check_err.hpp  | 127 ++++++++++++++++++
 .../profiler/profile_pool3d_fwd_impl.hpp      |  38 +++++-
 3 files changed, 167 insertions(+), 7 deletions(-)

diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index debeb472a..39f532e0e 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -1803,4 +1803,13 @@ struct NumericUtils<bf8_t>
     static constexpr int bias = 16; // negative zero nan mode
     // static constexpr int bias = 15; // ieee mode
 };
+
+template <>
+struct NumericUtils<bhalf_t>
+{
+    static constexpr int exp  = 8;
+    static constexpr int mant = 7;
+    static constexpr int bias = 128; // negative zero nan mode
+    // static constexpr int bias = 127; // ieee mode
+};
 } // namespace ck
diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index 58479f212..73ac2a189 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -23,6 +23,130 @@
 namespace ck {
 namespace utils {
 
+template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
+double get_relative_threshold(const int numberOfAccumulations = 1)
+{
+    using F8   = ck::f8_t;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using F32  = float;
+    using I8   = int8_t;
+    using I32  = int32_t;
+
+    static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
+                      is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
+                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                      is_same_v<ComputeDataType, int>,
+                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
+    double compute_error = 0;
+    if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                 is_same_v<ComputeDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        compute_error = std::pow(2, -NumericUtils<ComputeDataType>::mant) * 0.5;
+    }
+
+    static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
+                      is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
+                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                      is_same_v<OutDataType, int>,
+                  "Warning: Unhandled OutDataType for setting up the relative threshold!");
+    double output_error = 0;
+    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                 is_same_v<OutDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        output_error = std::pow(2, -NumericUtils<OutDataType>::mant) * 0.5;
+    }
+    double midway_error = std::max(compute_error, output_error);
+
+    static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
+                      is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
+                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                      is_same_v<AccDataType, int>,
+                  "Warning: Unhandled AccDataType for setting up the relative threshold!");
+    double acc_error = 0;
+    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                 is_same_v<AccDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+    }
+    return std::max(acc_error, midway_error);
+}
+
+template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
+double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1)
+{
+    using F8   = ck::f8_t;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using F32  = float;
+    using I8   = int8_t;
+    using I32  = int32_t;
+
+    static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
+                      is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
+                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                      is_same_v<ComputeDataType, int>,
+                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
+    auto expo            = std::log2(std::abs(max_possible_num));
+    double compute_error = 0;
+    if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                 is_same_v<ComputeDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        compute_error = std::pow(2, expo - NumericUtils<ComputeDataType>::mant) * 0.5;
+    }
+
+    static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
+                      is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
+                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                      is_same_v<OutDataType, int>,
+                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");
+    double output_error = 0;
+    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                 is_same_v<OutDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        output_error = std::pow(2, expo - NumericUtils<OutDataType>::mant) * 0.5;
+    }
+    double midway_error = std::max(compute_error, output_error);
+
+    static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
+                      is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
+                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                      is_same_v<AccDataType, int>,
+                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");
+    double acc_error = 0;
+    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                 is_same_v<AccDataType, int>)
+    {
+        return 0;
+    }
+    else
+    {
+        acc_error =
+            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+    }
+    return std::max(acc_error, midway_error);
+}
+
 template <typename Range, typename RefRange>
 typename std::enable_if<
     std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
@@ -253,11 +377,13 @@ check_err(const Range& out,
     int err_count  = 0;
     double err     = 0;
     double max_err = std::numeric_limits<float>::min();
+
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         const double o = type_convert<float>(*std::next(std::begin(out), i));
         const double r = type_convert<float>(*std::next(std::begin(ref), i));
         err            = std::abs(o - r);
+
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
@@ -270,6 +396,7 @@ check_err(const Range& out,
             res = false;
         }
     }
+
     if(!res)
     {
         std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index 3bdaa5c83..a0890028a 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -102,11 +102,22 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
     Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
         f_host_tensor_descriptor(N, C, Do, Ho, Wo));
 
+    constexpr int inDataRangeTensor1{1};
+    constexpr int inDataRangeTensor2{5};
+    constexpr double inDataRangeTensor3{0.5};
+
     switch(in_params.init_method)
     {
-    case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
-    case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
-    default: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    case 0:
+        in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{inDataRangeTensor1});
+        break;
+    case 1:
+        in_n_c_di_hi_wi.GenerateTensorValue(
+            GeneratorTensor_2<InDataType>{-inDataRangeTensor2, inDataRangeTensor2});
+        break;
+    default:
+        in_n_c_di_hi_wi.GenerateTensorValue(
+            GeneratorTensor_3<InDataType>{-inDataRangeTensor3, inDataRangeTensor3});
     }
 
     DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
@@ -229,12 +240,25 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         {
             out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
 
-            auto tolerance = 1e-3;
-            bool pass      = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
+            auto absolute_error_threshold = 1.0;
+            switch(in_params.init_method)
+            {
+            case 0: absolute_error_threshold = static_cast<double>(inDataRangeTensor1); break;
+            case 1: absolute_error_threshold = static_cast<double>(inDataRangeTensor2); break;
+            default: absolute_error_threshold = inDataRangeTensor3;
+            }
+
+            absolute_error_threshold =
+                ck::utils::get_absolute_threshold<ComputeDataType, OutDataType>(
+                    absolute_error_threshold);
+            auto relative_error_threshold =
+                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>();
+
+            bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                              out_n_c_do_ho_wo_host.mData,
                                              "Error: Incorrect results",
-                                             tolerance,
-                                             tolerance);
+                                             relative_error_threshold,
+                                             absolute_error_threshold);
 
             if constexpr(OutputIndex)
             {
-- 
GitLab


From 7d576f1748eca6f02f5ab3e0a860ed3cb3a9c6d8 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Fri, 25 Oct 2024 10:13:46 -0500
Subject: [PATCH 015/153] Update GPU verification (#1596)

* Update inits

* Update static_cast to type_convert

* Add verification option selection
---
 example/01_gemm/common.hpp                    | 15 ++++++-----
 example/01_gemm/run_gemm_example.inc          | 27 ++++++++++---------
 .../01_gemm/run_gemm_example_streamk_v2.inc   |  2 +-
 example/01_gemm/run_gemm_example_v2.inc       |  2 +-
 .../gpu/reference_gemm.hpp                    | 10 +++----
 5 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index d08196924..6e1c9f2a0 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -75,9 +75,10 @@ struct ProblemSizeSplitK final
 
 struct ExecutionConfig final
 {
-    bool do_verification = true;
-    int init_method      = 2;
-    bool time_kernel     = false;
+    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
+    int do_verification = 3;
+    int init_method     = 2;
+    bool time_kernel    = false;
 };
 
 template <ck::index_t... Is>
@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
     else
     {
         std::cerr
-            << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+            << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
             << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
             << "arg3: time kernel (0=no, 1=yes)" << std::endl
             << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl
@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
     }
     else
     {
-        std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
+        std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
                   << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index fe12998e3..bafec3f35 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 
     bool pass = true;
 
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         // CPU verification
         auto ref_gemm    = ReferenceGemmInstance{};
@@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
-        pass &= !ck::utils::check_err(c_m_n_device_result,
-                                      c_m_n_host_result,
-                                      "Error: Incorrect results!",
-                                      get_rtol<CDataType>(),
-                                      get_atol<CDataType>());
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
 #endif
+    }
 
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
         // GPU verification
         auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
         auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
-        pass &= !ck::utils::check_err(c_m_n_device_result,
-                                      c_m_n_device_ref_result,
-                                      "Error: Incorrect results!",
-                                      get_rtol<CDataType>(),
-                                      get_atol<CDataType>());
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_device_ref_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
     }
 
-    return !pass;
+    return pass == true;
 }
 
 bool run_gemm_example(int argc, char* argv[])
diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc
index 6679f9515..8ed8b81be 100644
--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     }
 
     bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index 0bcee658b..71524fdec 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     }
 
     bool pass = true;
-    if(config.do_verification)
+    if((config.do_verification == 1) || (config.do_verification == 3))
     {
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
index 639b5fe80..2c2cac77e 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
@@ -45,10 +45,10 @@ __global__ void
     if(row_idx < m && col_idx < n)
     {
 
-        AccDataType v_acc = static_cast<AccDataType>(0.0);
-        ComputeTypeA v_a  = static_cast<ComputeTypeA>(0.0);
-        ComputeTypeB v_b  = static_cast<ComputeTypeB>(0.0);
-        CDataType v_c     = static_cast<CDataType>(0.0);
+        AccDataType v_acc{0};
+        ComputeTypeA v_a{0};
+        ComputeTypeB v_b{0};
+        CDataType v_c{0};
 
         for(int k_idx = 0; k_idx < k; ++k_idx)
         {
@@ -76,7 +76,7 @@ __global__ void
             // apply b_element_op
             b_element_op(v_b, p_b_grid[element_idx_b]);
             // multiply and accumulate
-            v_acc += static_cast<AccDataType>(v_a) * static_cast<AccDataType>(v_b);
+            v_acc += type_convert<AccDataType>(v_a) * type_convert<AccDataType>(v_b);
         }
         // apply c_element_op
         c_element_op(v_c, v_acc);
-- 
GitLab


From eda593838621984ea008a783ca0093350a7bf60e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 18 Oct 2024 17:09:12 +0000
Subject: [PATCH 016/153] add parsing grouped conv fwd instances

---
 .../grouped_conv_fwd/gen_instances.py         | 167 ++++++++++++++++++
 python/ck4inductor/grouped_conv_fwd/op.py     |  93 ++++++++++
 .../universal_gemm/gen_instances.py           |   5 +-
 python/ck4inductor/universal_gemm/op.py       |   3 +
 python/ck4inductor/util.py                    |   5 +-
 5 files changed, 271 insertions(+), 2 deletions(-)
 create mode 100644 python/ck4inductor/grouped_conv_fwd/gen_instances.py
 create mode 100644 python/ck4inductor/grouped_conv_fwd/op.py

diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
new file mode 100644
index 000000000..ffbea6bdc
--- /dev/null
+++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+import logging
+import os
+import subprocess
+from dataclasses import replace
+from functools import lru_cache
+from typing import List
+
+from ..util import library_path
+
+from .op import CKGroupedConvFwdOp
+
+log = logging.getLogger(__name__)
+
+
+def _ck_conv_instances_path():
+    conv_instances_path = os.path.join(  # noqa: F821
+        library_path(),
+        "include",
+        "ck",
+        "library",
+        "tensor_operation_instance",
+        "gpu",
+        "grouped_conv_fwd",
+    )
+    if not os.path.exists(conv_instances_path):
+        log.error(
+            "CK library conv instances path %s does not exist", conv_instances_path
+        )
+        return None
+    return conv_instances_path
+
+
+def parse_instances(str_instances: List[str]) -> List[CKGroupedConvFwdOp]:
+    """
+    Parse the lines containing Grouped Convolution Forward template instances
+    into `CKGroupedConvFwdOp` instances
+    """
+
+    def maybe_int(s):
+        try:
+            return int(s)
+        except ValueError:
+            return s
+
+    op_instances = []
+    # TODO: maybe use libclang for parsing C++ code in the future
+    # to avoid this hacky parsing logic below ? :) - copilot
+    for line in str_instances:
+        s_template_args = line.split("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")[
+            -1
+        ].strip("<>, ")
+        template_args = []
+        i_current = 0
+        while i_current < len(s_template_args):
+            if s_template_args[i_current] == " ":
+                # skip whitespace
+                i_current += 1
+                continue
+            elif s_template_args[i_current : i_current + 2] == "S<":
+                # parse template S<Index...>
+                i_next = s_template_args.find(">", i_current)
+                template_args.append(
+                    tuple(map(int, s_template_args[i_current + 2 : i_next].split(",")))
+                )
+                i_current = i_next + 2
+            else:
+                # all string attributes must be either type aliases or global constants in C++
+                i_next = s_template_args.find(",", i_current)
+                template_args.append(
+                    maybe_int(
+                        s_template_args[i_current : i_next if i_next != -1 else None]
+                    )
+                )
+                if i_next != -1:
+                    i_current = i_next + 1
+            if i_next == -1:
+                break
+
+        template_args[0] = -1  # n_dim_spatial
+        template_args[3] = tuple()  # ds_layout
+        template_args[9] = tuple()  # ds_element_dtype
+
+        new_instance = CKGroupedConvFwdOp(
+            *template_args,  # type: ignore[arg-type]
+        )
+
+        op_instances.append(new_instance)
+    return op_instances
+
+
+@lru_cache(None)
+def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]:
+    """
+    Parse the Grouped Convolution Forward instances
+    defined in the Composable Kernel library folder.
+    """
+    ck_library_dir = _ck_conv_instances_path()
+    if not ck_library_dir:
+        return []
+
+    grep_result = subprocess.run(
+        [
+            "grep",
+            "-inR",
+            "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3",
+            ck_library_dir,
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    op_instances = parse_instances(grep_result.stdout.strip().split("\n"))
+
+    log.debug("ck instances from library: %d", len(op_instances))
+
+    schedulers = [
+        "BlockGemmPipelineScheduler::Intrawave",
+        "BlockGemmPipelineScheduler::Interwave",
+    ]
+    conv_specs = [
+        "ConvolutionForwardSpecialization::Default",
+        "ConvolutionForwardSpecialization::Filter1x1Pad0",
+        "ConvolutionForwardSpecialization::Filter1x1Stride1Pad0",
+        "ConvolutionForwardSpecialization::OddC",
+    ]
+
+    # substitute templated args by looping through their domains
+    substitute_instances = []
+    for instance in op_instances:
+        sub_scheduler = (
+            instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
+        )
+        sub_spec = instance.conv_forward_specialization == "ConvSpec"
+        schedulers_range = (
+            schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]
+        )
+        spec_range = conv_specs if sub_spec else [instance.conv_forward_specialization]
+        for scheduler in schedulers_range:
+            for spec in spec_range:
+                for channels_last in [True, False]:
+                    if channels_last:
+                        a_layout = "NHWGC"
+                        e_layout = "NHWGK"
+                    else:
+                        a_layout = "NGCHW"
+                        e_layout = "NGKHW"
+                    substitute_instances.append(
+                        replace(
+                            instance,
+                            block_gemm_pipeline_scheduler=scheduler,
+                            conv_forward_specialization=spec,
+                            gemm_specialization="GemmSpecialization::MNKPadding",
+                            n_dim_spatial=2,
+                            a_layout=a_layout,
+                            b_layout="GKYXC",
+                            e_layout=e_layout,
+                        )
+                    )
+
+    return substitute_instances
+
+
+if __name__ == "__main__":
+    print(gen_conv_ops_library())
diff --git a/python/ck4inductor/grouped_conv_fwd/op.py b/python/ck4inductor/grouped_conv_fwd/op.py
new file mode 100644
index 000000000..25d45e8ff
--- /dev/null
+++ b/python/ck4inductor/grouped_conv_fwd/op.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+from dataclasses import asdict, dataclass
+from typing import Optional, Tuple
+
+
+@dataclass
+class CKGroupedConvFwdOp:
+    n_dim_spatial: int
+    a_layout: str
+    b_layout: str
+    ds_layout: Tuple[str]
+    e_layout: str
+    a_element_dtype: str
+    b_element_dtype: str
+    acc_dtype: str
+    c_shuffle_dtype: str
+    ds_element_dtype: Tuple[str]
+    e_element_dtype: str
+    a_elementwise_op: str
+    b_elementwise_op: str
+    cde_elementwise_op: str
+    conv_forward_specialization: str
+    gemm_specialization: str
+
+    block_size: int
+    m_per_block: int
+    n_per_block: int
+    k_per_block: int
+    ak1: int
+    bk1: int
+    m_per_xdl: int
+    n_per_xdl: int
+    m_xdl_per_wave: int
+    n_xdl_per_wave: int
+    a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int]
+    a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    a_block_transfer_src_access_order: Tuple[int, int, int]
+    a_block_transfer_src_vector_dim: int
+    a_block_transfer_src_scalar_per_vector: int
+    a_block_transfer_dst_scalar_per_vector_ak1: int
+    a_block_lds_extra_m: bool
+
+    b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int]
+    b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    b_block_transfer_src_access_order: Tuple[int, int, int]
+
+    b_block_transfer_src_vector_dim: int
+    b_block_transfer_src_scalar_per_vector: int
+    b_block_transfer_dst_scalar_per_vector_bk1: int
+    b_block_lds_extra_n: bool
+
+    c_shuffle_m_xdl_per_wave_per_shuffle: int
+    c_shuffle_n_xdl_per_wave_per_shuffle: int
+    cde_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: Tuple[  # noqa
+        int,
+        int,
+        int,
+        int,
+    ]
+    cde_block_transfer_scalar_per_vector_n_per_block: int
+    block_gemm_pipeline_scheduler: str
+    block_gemm_pipeline_version: str
+
+    a_compute_dtype: Optional[str] = None
+    b_compute_dtype: Optional[str] = None
+
+    def name(self):
+        # cpp alias for template instance
+        return (
+            f"ck_device_grouped_convolution_fwd_multiple_abd_xdl_c_shuffle_v3_"
+            f"{self.key_name()}"
+        )
+
+    def key_name(self):
+        # TBD; must be unique per instance. Intended to use as dict key
+        return "_".join(
+            [
+                "K"
+                + field_name.replace("_", "").lower()
+                + "V"
+                + (
+                    "x".join(map(str, iter(field_value)))
+                    if isinstance(field_value, tuple)
+                    else str(field_value).replace(":", "")
+                )
+                for field_name, field_value in self.dict_items()
+            ]
+        )
+
+    def dict_items(self):
+        return asdict(self).items()
diff --git a/python/ck4inductor/universal_gemm/gen_instances.py b/python/ck4inductor/universal_gemm/gen_instances.py
index 5594b8681..24bab5477 100644
--- a/python/ck4inductor/universal_gemm/gen_instances.py
+++ b/python/ck4inductor/universal_gemm/gen_instances.py
@@ -1,7 +1,10 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
 import logging
 import os
 import subprocess
-from dataclasses import fields, replace
+from dataclasses import replace
 from functools import lru_cache, partial
 from typing import List
 
diff --git a/python/ck4inductor/universal_gemm/op.py b/python/ck4inductor/universal_gemm/op.py
index a8bb72500..946aaa7af 100644
--- a/python/ck4inductor/universal_gemm/op.py
+++ b/python/ck4inductor/universal_gemm/op.py
@@ -1,3 +1,6 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
 from dataclasses import asdict, dataclass
 from typing import Optional, Tuple
 
diff --git a/python/ck4inductor/util.py b/python/ck4inductor/util.py
index 79d6be00f..4d7e8bd87 100644
--- a/python/ck4inductor/util.py
+++ b/python/ck4inductor/util.py
@@ -1,7 +1,10 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
 import functools
 import os
 
 
 @functools.lru_cache(None)
 def library_path():
-    return os.path.join(os.path.dirname(__file__), 'library')
+    return os.path.join(os.path.dirname(__file__), "library")
-- 
GitLab


From 37f7afed1e2a19be8c04b7cd26d07db41c082e88 Mon Sep 17 00:00:00 2001
From: valarLip <103567126+valarLip@users.noreply.github.com>
Date: Sat, 26 Oct 2024 16:39:34 +0800
Subject: [PATCH 017/153] add int8 gemm multiply multiply a8w8 (#1591)

* add int8 gemm multiply multiply a8w8

* uncomment

* clang-format-12

* Add example_gemm_multiply_multiply_xdl_int8

* Remove shell scripts

* update preprocess number for mi308; bring back printout in ckprofiler

* format

---------

Co-authored-by: chenjun <junchen2@amd.com>
Co-authored-by: Haocong WANG <haocwang@amd.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   1 +
 .../gemm_multiply_multiply_xdl_int8.cpp       | 304 ++++++++++++++++++
 include/ck/host_utility/flush_cache.hpp       |  55 +++-
 .../gpu/element/element_wise_operation.hpp    |  20 ++
 include/ck/utility/amd_xdlops.hpp             |  12 +-
 .../gpu/gemm_multiply_multiply.hpp            | 105 ++++++
 .../gpu/gemm_multiply_multiply/CMakeLists.txt |  10 +
 ...tiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp |  99 ++++++
 ...i8_bf16_mk_nk_mn_comp_default_instance.cpp |  32 ++
 ...8_bf16_mk_nk_mn_comp_kpadding_instance.cpp |  32 ++
 ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp |  33 ++
 ...bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp |  33 ++
 ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp |  33 ++
 ...bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp |  33 ++
 .../profile_gemm_multiply_multiply_impl.hpp   |  10 +-
 .../src/profile_gemm_multiply_multiply.cpp    |  10 +-
 16 files changed, 794 insertions(+), 28 deletions(-)
 create mode 100644 example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index d39114013..55c884246 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp)
 add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp)
 add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp)
+add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp)
\ No newline at end of file
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
new file mode 100644
index 000000000..fb1642bba
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I8  = int8_t;
+using I32 = int;
+using F16 = ck::half_t;
+using FP8 = ck::f8_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = I8;
+using B0DataType       = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F16;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using D0Layout = Row;
+using D1Layout = Col;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, float, float, float>(
+        ck::half_t& e, const float& c, const float& d0, const float& d1) const
+    {
+        const float x0_f = c * d0 * d1;
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, int, float, float>(
+        ck::half_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
+        ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::bhalf_t>(x0_f);
+    }
+};
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MultiplyMultiply;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3
+    // clang-format off
+///######|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+///######|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
+///###### RRR
+      ///<      Row,      Row, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   256,   128,    64,  16,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>;
+///###### RCR
+         <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<4, 64, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideD = 0;
+    ck::index_t StrideE = N;
+
+    ck::index_t KBatch = 1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 12)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+
+        KBatch = std::stoi(argv[11]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf(
+            "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
+        exit(0);
+    }
+    do_verification = false;
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{0, 2});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{0, 2});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+
+    constexpr auto I0 = ck::Number<0>{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, NumDTensor>{I0, I0},
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false});
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B0DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp
index 63fa365cc..918fb28ea 100644
--- a/include/ck/host_utility/flush_cache.hpp
+++ b/include/ck/host_utility/flush_cache.hpp
@@ -237,7 +237,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
                                              Args... args)
 {
 #if CK_TIME_KERNEL
-#define MEDIAN 1
+#define MEDIAN 0
     if(stream_config.time_kernel_)
     {
         if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
@@ -275,6 +275,14 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #else
         float total_time = 0;
 #endif
+        hipEvent_t start, stop;
+
+        hip_check_error(hipEventCreate(&start));
+        hip_check_error(hipEventCreate(&stop));
+
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+
         for(int i = 0; i < nrepeat; ++i)
         {
             if constexpr(!TimePreprocess)
@@ -282,13 +290,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
                 preprocess();
             }
 
-            hipEvent_t start, stop;
+            // hipEvent_t start, stop;
 
-            hip_check_error(hipEventCreate(&start));
-            hip_check_error(hipEventCreate(&stop));
+            // hip_check_error(hipEventCreate(&start));
+            // hip_check_error(hipEventCreate(&stop));
 
-            hip_check_error(hipDeviceSynchronize());
-            hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+            // hip_check_error(hipDeviceSynchronize());
+            // hip_check_error(hipEventRecord(start, stream_config.stream_id_));
             // calculate preprocess time
             if constexpr(TimePreprocess)
             {
@@ -299,25 +307,34 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             hip_check_error(hipGetLastError());
             // end real kernel
 
-            hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
-            hip_check_error(hipEventSynchronize(stop));
-            float cur_time = 0;
-            hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
-#if MEDIAN
-            times.insert(cur_time);
-#else
-            total_time += cur_time;
-#endif
+            //             hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+            //             hip_check_error(hipEventSynchronize(stop));
+            //             float cur_time = 0;
+            //             hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
+            // #if MEDIAN
+            //             times.insert(cur_time);
+            // #else
+            //             total_time += cur_time;
+            // #endif
 
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
-                std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
+                // std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
 
                 printf("gemm_args.p_a_grid: %p, gemm_args.p_b_grid:%p\n",
                        static_cast<const void*>(gemm_args.p_a_grid),
                        static_cast<const void*>(gemm_args.p_b_grid));
             }
         }
+        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+        hip_check_error(hipEventSynchronize(stop));
+        float cur_time = 0;
+        hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
+#if MEDIAN
+        times.insert(cur_time);
+#else
+        total_time += cur_time;
+#endif
 
 #if MEDIAN
         auto mid = times.begin();
@@ -333,7 +350,11 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             return (*mid + *mid_next) / 2;
         }
 #else
-        return total_time / nrepeat;
+        // return total_time / nrepeat;
+        hipDeviceProp_t deviceProps;
+        hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
+        float preprocess_offset = deviceProps.multiProcessorCount == 80 ? 0.005 : 0.01;
+        return (total_time - preprocess_offset * nrepeat) / nrepeat;
 #endif
     }
     else
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 9c60121c8..135eaec93 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -272,6 +272,26 @@ struct MultiplyMultiply
 
         e = ck::type_convert<ck::bhalf_t>(x0_f);
     }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, int, ck::half_t, ck::half_t>(
+        ck::half_t& e, const int& c, const ck::half_t& d0, const ck::half_t& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
+        ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::bhalf_t>(x0_f);
+    }
 };
 
 struct MultiplyAddFastGelu
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index d8ccb2ea7..a955279bc 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -327,12 +327,12 @@ struct intrin_mfma_i32_16x16x32i8<16, 16>
     __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
     {
         reg_c.template AsType<int32x4_t>()(Number<0>{}) =
-            __builtin_amdgcn_mfma_i32_16x16x32i8(bit_cast<int64_t>(reg_a),
-                                                 bit_cast<int64_t>(reg_b),
-                                                 reg_c.template AsType<int32x4_t>()[Number<0>{}],
-                                                 0,
-                                                 0,
-                                                 0);
+            __builtin_amdgcn_mfma_i32_16x16x32_i8(bit_cast<int64_t>(reg_a),
+                                                  bit_cast<int64_t>(reg_b),
+                                                  reg_c.template AsType<int32x4_t>()[Number<0>{}],
+                                                  0,
+                                                  0,
+                                                  0);
     }
 };
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
index 2077f904d..b6aa61277 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
@@ -96,6 +96,87 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_i
                                                           MultiplyMultiply>>>& instances);
 #endif
 
+#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8))
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+#endif
+
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -155,6 +236,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
+#endif
+#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8))
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+            }
+        }
 #endif
         return op_ptrs;
     }
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
index 5e56aebcf..0107c3dec 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
@@ -8,9 +8,19 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
         )
 
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
+set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
 add_instance_library(device_gemm_multiply_multiply_instance ${GEMM_MULTIPLY_MULTIPLY_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp
new file mode 100644
index 000000000..2d4c37199
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8   = int8_t;
+using I32  = int;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough      = element_wise::PassThrough;
+using MultiplyMultiply = element_wise::MultiplyMultiply;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   256,    64,  16,  16,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   128,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    I8,    I8,    Tuple<F32, F32>, BF16,  I32,     I32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, I8>
+    // clang-format oI
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|       Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |                |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, I8>,
+        // Memory friendly
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,   256,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,   256,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,   128,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,   128,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    64,   32,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    64,   16,    128, 16,  16,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,     64, 16,  16,  16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    16,   64,    128, 16,  16,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   64,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    16,  128,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    16,  256,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     I8,     I8,    Tuple<F32, F32>, BF16,   I32,     I32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  256,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, I8>
+    // clang-format oI
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..09ee08dd6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
new file mode 100644
index 000000000..e18262108
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..173bd4dcb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 000000000..6aa427433
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..5797f0c8b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 000000000..7dc8440bf
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_i8_i8_bf16/device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          I8,
+                                                          I8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
index 7dd7b041e..29a645e9d 100644
--- a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
@@ -271,10 +271,12 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
                           << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
                           << kbatch_curr << std::endl;
 
-#if defined CK_ENABLE_FP8
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_INT8
                 // set softer tolerances for fp8
-                if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
-                             is_same_v<EDataType, f8_t>)
+                if constexpr((is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
+                              is_same_v<EDataType, f8_t>) ||
+                             (is_same_v<ADataType, int8_t> || is_same_v<BDataType, int8_t> ||
+                              is_same_v<EDataType, int8_t>))
                 {
                     std::string msg = "Error: Incorrect results!";
                     double rtol     = 1e-1;
@@ -286,7 +288,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
                 {
 #endif
                     pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
-#if defined CK_ENABLE_FP8
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_INT8
                 }
 #endif
 
diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp
index b7e80ed79..df87cc815 100644
--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
@@ -27,6 +27,7 @@ enum struct GemmDataType
     F16_F8_F16,     // 5
     F16_F16_F16_F8, // 6
     F8_F8_BF16,     // 7
+    INT8_INT8_BF16, // 8
 };
 
 #define OP_NAME "gemm_multiply_multiply"
@@ -39,7 +40,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
                "f16->f8; 7: f8->bf16, "
-               "comp f8)\n");
+               "comp f8; 8: int8->bf16)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -89,6 +90,8 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
     using F32  = float;
     using BF16 = ck::bhalf_t;
     using F8   = ck::f8_t;
+    using I8   = int8_t;
+    using I32  = int;
 
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -162,6 +165,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
         return profile(
             F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
+    else if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(
+            I8{}, I8{}, I8{}, I32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
+    }
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
-- 
GitLab


From 54f0e6f4bb37f574b703ee22d069d773c0d95dfd Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Sat, 26 Oct 2024 18:35:45 +0800
Subject: [PATCH 018/153] [CK_TILE] More fmha splitkv optimizations (#1588)

* Use pre-defined constants for readability

* Use vector write for o_acc tensor

* Remove no-longer used policy method

* Deprecate no-longer used policy/pipeline

* Specify gemm0/gemm1 block warps separately in codegen

* Fix wrong ps_idx creation logic

* Add single-warp block gemm

* Supoprt single-warp gemm0

* Make MakeCBlockTile() as static method

* Use MakeCBlockTile() to get underlying tile distribution

* Use kNumGemm1Warps to compute # threads for gemm1

* Put normal case in the if clause

* Refine fmha splitkv block mapping

* Refine & fix the lse_acc/o_acc layout

* Fix wrong LDS size for K tile

* Use kK0=64 for hdim=128,256 fmha splitkv kernels

* Use kK1=64 for hdim=32,64,128 fmha splitkv kernels

* Undo kK0/kK1 changes

* Use more reasonable GetAlignmentV() computation

* Using store_tile() in fmha splitkv kernel epilogue
---
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |  41 +--
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |  32 +--
 example/ck_tile/01_fmha/fmha_fwd.cpp          |  64 ++---
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |   6 +-
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |   8 +-
 .../fmha_fwd_splitkv_tile_partitioner.hpp     |   9 +-
 ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp |  13 +-
 ...litkv_pipeline_qr_ks_vs_default_policy.hpp |  21 +-
 .../pipeline/block_fmha_pipeline_problem.hpp  |  14 +-
 .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp |  10 +-
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    |  10 +-
 .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp |   3 +-
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 131 ++++------
 .../ops/fmha/pipeline/tile_fmha_shape.hpp     |   9 +-
 include/ck_tile/ops/gemm.hpp                  |   1 +
 .../block/block_gemm_areg_breg_creg_v1.hpp    |   2 +-
 ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp | 237 ++++++++++++++++++
 .../block/block_gemm_areg_bsmem_creg_v1.hpp   |   2 +-
 .../block/block_gemm_areg_bsmem_creg_v2.hpp   |   2 +-
 .../block/block_gemm_asmem_breg_creg_v1.hpp   |   2 +-
 .../block/block_gemm_asmem_bsmem_creg_v1.hpp  |   2 +-
 .../ck_tile/ops/reduce/block/block_reduce.hpp |   2 +-
 22 files changed, 422 insertions(+), 199 deletions(-)
 create mode 100644 include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 860ee20d3..805803fed 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -36,13 +36,12 @@ FMHA_FWD_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
 
 using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
-using fmha_block_warps_{F_idx} = ck_tile::sequence<{F_rm}, {F_rn}, {F_rk}>;
 using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
-                                      fmha_block_warps_{F_idx},
+                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
                                       fmha_warp_tile_{F_idx},
-                                      fmha_block_warps_{F_idx},
+                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
                                       fmha_warp_tile_{F_idx},
                                       {F_vlayout}>;
 
@@ -291,9 +290,12 @@ class FmhaFwdTileSize:
     F_bn1       : int  # tile size along v head_dim
     F_bk1       : int  # tile size along kv gemm unroll
     F_bk0blen   : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
-    F_rm        : int  # number of warps along q seqlen (block warps)
-    F_rn        : int  # number of warps along k seqlen(not used)
-    F_rk        : int  # number of warps along gemm-k(not used)
+    F_rm0       : int  # number of warps for gemm0 along q seqlen
+    F_rn0       : int  # number of warps for gemm0 along k seqlen 
+    F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1       : int  # number of warps for gemm1 along q seqlen
+    F_rn1       : int  # number of warps for gemm1 along head dim v
+    F_rk1       : int  # number of warps for gemm1 along k seqlen (not used)
     F_wm        : int  # warp size along m (warp size)
     F_wn        : int  # warp size along n
     F_wk        : int  # warp size along k
@@ -301,8 +303,8 @@ class FmhaFwdTileSize:
     @property
     def name(self) -> str:
         return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\
-        f"_r{self.F_rm}x{self.F_rn}x{self.F_rk}_w{self.F_wm}x{self.F_wn}x{self.F_wk}" +\
-            ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
+        f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 
 @dataclass
 class FmhaFwdKernel:
@@ -334,9 +336,12 @@ class FmhaFwdKernel:
                 F_bn1           = self.F_tile.F_bn1,
                 F_bk1           = self.F_tile.F_bk1,
                 F_bk0blen       = self.F_tile.F_bk0blen,
-                F_rm            = self.F_tile.F_rm,
-                F_rn            = self.F_tile.F_rn,
-                F_rk            = self.F_tile.F_rk,
+                F_rm0           = self.F_tile.F_rm0,
+                F_rn0           = self.F_tile.F_rn0,
+                F_rk0           = self.F_tile.F_rk0,
+                F_rm1           = self.F_tile.F_rm1,
+                F_rn1           = self.F_tile.F_rn1,
+                F_rk1           = self.F_tile.F_rk1,
                 F_wm            = self.F_tile.F_wm,
                 F_wn            = self.F_tile.F_wn,
                 F_wk            = self.F_tile.F_wk,
@@ -394,16 +399,16 @@ class FmhaFwdKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1, 32, 32, 16, -1),
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1, 32, 32, 16, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1, 32, 32, 16, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1, 32, 32, 16, -1),
+            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1,  2, 1, 1,  32, 32, 16, -1),
+            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     2, 1, 1, 32, 32, 32, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1, 32, 32, 32, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1, 32, 32, 32, -1)
+            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
         }
     else:
         return None
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 57360ea99..46c26b22c 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -42,13 +42,12 @@ namespace {{
 template <bool kHasUnevenSplits>
 struct kernel_runner {{
 using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
-using fmha_block_warps = ck_tile::sequence<{F_rm}, {F_rn}, {F_rk}>;
 using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
-                                          fmha_block_warps,
+                                          ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
                                           fmha_warp_tile,
-                                          fmha_block_warps,
+                                          ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
                                           fmha_warp_tile,
                                           {F_vlayout}>;
 
@@ -162,10 +161,12 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem<
 using fmha_pipeline = ck_tile::BlockFmhaFwdSplitKVCombinePipeline<
     fmha_pipeline_problem>;
 
+/// FIXME: use {F_spad}/{F_dvpad} as kPadM/kPadN parameters after solving
+///        store_tile_raw() data corruption issue
 using fmha_epilogue =
     ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
                                            typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
-                                           {F_spad}, {F_dvpad}>>;
+                                           false, false>>;
 
 using fmha_kernel =
     ck_tile::FmhaFwdSplitKVCombineKernel<ck_tile::FmhaFwdSplitKVCombineTilePartitioner<{F_bm0}, {F_bn1}>,
@@ -458,9 +459,12 @@ class FmhaFwdSplitKVKernel:
                 F_bn1           = self.F_tile.F_bn1,
                 F_bk1           = self.F_tile.F_bk1,
                 F_bk0blen       = self.F_tile.F_bk0blen,
-                F_rm            = self.F_tile.F_rm,
-                F_rn            = self.F_tile.F_rn,
-                F_rk            = self.F_tile.F_rk,
+                F_rm0           = self.F_tile.F_rm0,
+                F_rn0           = self.F_tile.F_rn0,
+                F_rk0           = self.F_tile.F_rk0,
+                F_rm1           = self.F_tile.F_rm1,
+                F_rn1           = self.F_tile.F_rn1,
+                F_rk1           = self.F_tile.F_rk1,
                 F_wm            = self.F_tile.F_wm,
                 F_wn            = self.F_tile.F_wn,
                 F_wk            = self.F_tile.F_wk,
@@ -553,16 +557,16 @@ class FmhaFwdSplitKVCombineKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32, 32,   2, 1, 1, 16, 16, 16, -1),
-            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32, 64,   4, 1, 1, 16, 16, 16, -1),
-            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128,  4, 1, 1, 16, 16, 16, -1),
-            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256,  4, 1, 1, 16, 16, 16, -1),
+            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16, -1),
+            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32, 64,   2, 1, 1, 32, 32, 32, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1, 32, 32, 32, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1, 32, 32, 32, -1)
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
         }
     else:
         return None
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 6d519a7ea..14291715f 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -557,33 +557,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 #endif
 
-    struct
-    {
-        auto operator()(bool permute,
-                        ck_tile::index_t b /*batch*/,
-                        ck_tile::index_t h /*nhead*/,
-                        ck_tile::index_t s /*seqlen*/,
-                        ck_tile::index_t d /*hdim*/)
-        {
-            if(permute)
-                return std::array<ck_tile::index_t, 4>{b, h, s, d};
-            else
-                return std::array<ck_tile::index_t, 4>{b, s, h, d};
-        }
-
-        auto operator()(bool permute,
-                        ck_tile::index_t ns /*num_splits*/,
-                        ck_tile::index_t b /*batch*/,
-                        ck_tile::index_t h /*nhead*/,
-                        ck_tile::index_t s /*seqlen*/,
-                        ck_tile::index_t d /*hdim*/)
-        {
-            if(permute)
-                return std::array<ck_tile::index_t, 5>{ns, b, h, s, d};
-            else
-                return std::array<ck_tile::index_t, 5>{ns, b, s, h, d};
-        }
-    } get_lengths;
+    static const auto get_lengths = [](bool permute,
+                                       ck_tile::index_t b /*batch*/,
+                                       ck_tile::index_t h /*nhead*/,
+                                       ck_tile::index_t s /*seqlen*/,
+                                       ck_tile::index_t d /*hdim*/) {
+        if(permute)
+            return std::array<ck_tile::index_t, 4>{b, h, s, d};
+        else
+            return std::array<ck_tile::index_t, 4>{b, s, h, d};
+    };
 
     bool is_v_rowmajor = vlayout == std::string("r");
 
@@ -635,12 +618,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::HostTensor<LSEDataType> lse_acc_host(
         1 < num_splits || use_kvcache
-            ? std::array<ck_tile::index_t, 4>{num_splits, shape_batch, nhead, shape_seqlen_q}
+            ? std::array<ck_tile::index_t, 4>{shape_batch, nhead, num_splits, shape_seqlen_q}
             : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
     ck_tile::HostTensor<OaccDataType> o_acc_host(
-        1 < num_splits || use_kvcache
-            ? get_lengths(o_perm, num_splits, shape_batch, nhead, shape_seqlen_q, hdim_v)
-            : std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
+        1 < num_splits || use_kvcache ? std::array<ck_tile::index_t, 5>{shape_batch,
+                                                                        nhead,
+                                                                        num_splits,
+                                                                        shape_seqlen_q,
+                                                                        hdim_v}
+                                      : std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
 
     // batch mode of lse data layout is [batch, nhead, seqlen_q]
     // group mode of lse data layout is [nhead, total_seqlen_q]
@@ -880,7 +866,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }();
         const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
         const ck_tile::index_t stride_randval = (max_seqlen_k);
-        const ck_tile::index_t stride_o_acc   = (o_perm ? hdim_v : nhead * hdim_v);
+        const ck_tile::index_t stride_o_acc   = (hdim_v);
         const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
         // setup nhead_stride_* arguments
         const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
@@ -906,8 +892,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
             (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
         const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_o_acc   = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
+        const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q);
+        const ck_tile::index_t nhead_stride_o_acc   = (num_splits * shape_seqlen_q * hdim_v);
         const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
         // setup batch_stride_* arguments
         const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q);
@@ -922,13 +908,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
         const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
         const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_o_acc   = (nhead * shape_seqlen_q * hdim_v);
-        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q);
+        const ck_tile::index_t batch_stride_o_acc = (nhead * num_splits * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_o     = (nhead * shape_seqlen_q * hdim_v);
         const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch);
         // setup split_stride_* arguments (only used in split-kv kernel)
-        const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q);
-        const ck_tile::index_t split_stride_o_acc = (shape_batch * nhead * shape_seqlen_q * hdim_v);
+        const ck_tile::index_t split_stride_lse_acc = (shape_seqlen_q);
+        const ck_tile::index_t split_stride_o_acc   = (shape_seqlen_q * hdim_v);
 
         args.q_ptr = q_buf.GetDeviceBuffer();
         args.k_ptr = k_buf.GetDeviceBuffer();
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index adabda165..8c1f6c805 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -69,7 +69,8 @@ struct FmhaFwdKernel
         // sync with generate.py
         // clang-format off
         using bfs = typename FmhaPipeline::BlockFmhaShape;
-        using gbr = typename bfs::Gemm0BlockWarps;
+        using g0br = typename bfs::Gemm0BlockWarps;
+        using g1br = typename bfs::Gemm1BlockWarps;
         using gwt = typename bfs::Gemm0WarpTile;
         #define _SS_  std::string
         #define _TS_  std::to_string
@@ -85,7 +86,8 @@ struct FmhaFwdKernel
             "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
-            "r" + _TS_(gbr::at(ck_tile::number<0>{})) + "x" + _TS_(gbr::at(ck_tile::number<1>{})) + "x" + _TS_(gbr::at(ck_tile::number<2>{})) + "_" +
+            "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
+            "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 34f75990c..ea30025b5 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -65,7 +65,8 @@ struct FmhaFwdSplitKVKernel
         // sync with generate.py
         // clang-format off
         using bfs = typename FmhaPipeline::BlockFmhaShape;
-        using gbr = typename bfs::Gemm0BlockWarps;
+        using g0br = typename bfs::Gemm0BlockWarps;
+        using g1br = typename bfs::Gemm1BlockWarps;
         using gwt = typename bfs::Gemm0WarpTile;
         #define _SS_  std::string
         #define _TS_  std::to_string
@@ -81,7 +82,8 @@ struct FmhaFwdSplitKVKernel
             "_" + (kIsGroupMode ? "group" : "batch") + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
-            "r" + _TS_(gbr::at(ck_tile::number<0>{})) + "x" + _TS_(gbr::at(ck_tile::number<1>{})) + "x" + _TS_(gbr::at(ck_tile::number<2>{})) + "_" +
+            "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
+            "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
@@ -894,7 +896,7 @@ struct FmhaFwdSplitKVKernel
                 o_acc_ptr,
                 make_tuple(kargs.seqlen_q, kargs.hdim_v),
                 make_tuple(kargs.stride_o_acc, 1),
-                number<1>{},
+                number<FmhaPipeline::kAlignmentOacc>{},
                 number<1>{});
 
             return pad_tensor_view(
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
index 2d06ba176..675a31019 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
@@ -26,8 +26,8 @@ struct FmhaFwdSplitKVTilePartitioner
     {
         // TODO: this may need tuning
         return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v, kN1),
-                    nhead * num_splits,
+                        ck_tile::integer_divide_ceil(hdim_v, kN1) * num_splits,
+                    nhead,
                     batch_size);
     }
 
@@ -42,8 +42,9 @@ struct FmhaFwdSplitKVTilePartitioner
             return ck_tile::make_tuple(quotient, modulus);
         };
 
-        const auto [i_tile_m, i_tile_n] = f(blockIdx.x, num_tile_n1);
-        const auto [i_nhead, i_split]   = f(blockIdx.y, num_splits);
+        const auto [mn, i_split]        = f(blockIdx.x, num_splits);
+        const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1);
+        const index_t i_nhead           = blockIdx.y;
         const index_t i_batch           = blockIdx.z;
 
         return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 75af7be82..6e7416ce8 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -64,6 +64,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
     }();
 
+    static constexpr index_t kAlignmentOacc =
+        kPadHeadDimV ? 1 : Policy::template GetAlignmentOacc<Problem>();
+
     static constexpr index_t kAlignmentBias =
         kPadSeqLenK ? 1 : Policy::template GetAlignmentBias<Problem>();
 
@@ -252,11 +255,11 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             k_dram_block_window_lengths, {adjusted_seqlen_k_start, 0});
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
-        auto bias_dram_window  = make_tile_window(
-            bias_dram_block_window_tmp.get_bottom_tensor_view(),
-            bias_dram_block_window_tmp.get_window_lengths(),
-            {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N
-            Policy::template MakeBiasDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto bias_dram_window =
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
+                             bias_dram_block_window_tmp.get_window_lengths(),
+                             {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N
+                             Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
             v_dram_block_window_lengths,
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
index 338319ab3..b7f1f042e 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
@@ -9,11 +9,20 @@
 namespace ck_tile {
 
 // This pipeline is qkv all located in LDS
-using BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy =
-    BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                        /* AsyncCopyK = */ false,
-                                        /* AsyncCopyV = */ false,
-                                        /* NumPrefetchK = */ 1,
-                                        /* NumPrefetchV = */ 1>;
+struct BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy
+    : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                          /* AsyncCopyK = */ false,
+                                          /* AsyncCopyV = */ false,
+                                          /* NumPrefetchK = */ 1,
+                                          /* NumPrefetchV = */ 1>
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc()
+    {
+        using OaccDataType = remove_cvref_t<typename Problem::OaccDataType>;
+
+        return static_cast<index_t>(16 / sizeof(OaccDataType));
+    }
+};
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index 1846664e7..d9da2f088 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -39,8 +39,11 @@ struct BlockFmhaPipelineProblem
     using FmhaMask              = remove_cvref_t<FmhaMask_>;
     using Traits                = remove_cvref_t<Traits_>;
 
-    static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size();
-    static constexpr bool kIsGroupMode  = kIsGroupMode_;
+    static constexpr index_t kNumGemm0Warps = BlockFmhaShape::NumGemm0Warps;
+    static constexpr index_t kNumGemm1Warps = BlockFmhaShape::NumGemm1Warps;
+    static constexpr index_t kBlockSize     = BlockFmhaShape::NumWarps * get_warp_size();
+
+    static constexpr bool kIsGroupMode = kIsGroupMode_;
 
     // attributes from traits
     static constexpr bool kPadSeqLenQ       = Traits::kPadSeqLenQ;
@@ -84,8 +87,11 @@ struct BlockFmhaFwdSplitKVPipelineProblem
     using FmhaMask            = remove_cvref_t<FmhaMask_>;
     using Traits              = remove_cvref_t<Traits_>;
 
-    static constexpr index_t kBlockSize = BlockFmhaShape::NumWarps * get_warp_size();
-    static constexpr bool kIsGroupMode  = kIsGroupMode_;
+    static constexpr index_t kNumGemm0Warps = BlockFmhaShape::NumGemm0Warps;
+    static constexpr index_t kNumGemm1Warps = BlockFmhaShape::NumGemm1Warps;
+    static constexpr index_t kBlockSize     = BlockFmhaShape::NumWarps * get_warp_size();
+
+    static constexpr bool kIsGroupMode = kIsGroupMode_;
 
     // attributes from traits
     static constexpr bool kPadSeqLenQ       = Traits::kPadSeqLenQ;
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
index 281ddc07b..6837ffdee 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -242,11 +242,11 @@ struct BlockFmhaPipelineQRKSVS
                              {seqlen_k_start, 0});
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
-        auto bias_dram_window  = make_tile_window(
-            bias_dram_block_window_tmp.get_bottom_tensor_view(),
-            bias_dram_block_window_tmp.get_window_lengths(),
-            {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
-            Policy::template MakeBiasDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto bias_dram_window =
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
+                             bias_dram_block_window_tmp.get_window_lengths(),
+                             {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
+                             Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0)>(
             randval_dram_block_window_tmp, seqlen_k_start);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index 19f569c45..c4872def1 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -314,11 +314,11 @@ struct BlockFmhaPipelineQRKSVSAsync
         }();
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
-        auto bias_dram_window  = make_tile_window(
-            bias_dram_block_window_tmp.get_bottom_tensor_view(),
-            bias_dram_block_window_tmp.get_window_lengths(),
-            {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
-            Policy::template MakeBiasDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto bias_dram_window =
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
+                             bias_dram_block_window_tmp.get_window_lengths(),
+                             {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
+                             Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0)>(
             randval_dram_block_window_tmp, seqlen_k_start);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
index bc9ca93d0..d08a8d489 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
@@ -9,9 +9,10 @@
 
 namespace ck_tile {
 
+/// NOTICE: we no-longer use this pipeline.
 // This pipeline is qkv all located in LDS
 template <typename Problem_, typename Policy_ = BlockFmhaPipelineQSKSVSDefaultPolicy>
-struct BlockFmhaPipelineQSKSVS
+struct [[deprecated]] BlockFmhaPipelineQSKSVS
 {
     using Problem               = remove_cvref_t<Problem_>;
     using Policy                = remove_cvref_t<Policy_>;
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index a66d2be78..807ad6548 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -15,6 +15,7 @@
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
 
 // TODO: remove this
 #define K_LDS_LOAD_USE_OFFSET_TRANSFORM 0
@@ -64,13 +65,28 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
         constexpr index_t M1 = MWarp;
         constexpr index_t M0 = kMPerBlock / (M2 * M1);
 
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>,
-                                       tuple<sequence<1>, sequence<2, 1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       sequence<1, 2, 2>,
-                                       sequence<0, 0, 2>>{});
+        if constexpr(1 < Problem::kNumGemm0Warps)
+        {
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>,
+                                           tuple<sequence<1>, sequence<2, 1>>,
+                                           tuple<sequence<1>, sequence<1, 2>>,
+                                           sequence<1, 2, 2>,
+                                           sequence<0, 0, 2>>{});
+        }
+        else
+        {
+            static_assert(MWarp == 1);
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>,
+                                           tuple<sequence<2, 1>>,
+                                           tuple<sequence<1, 2>>,
+                                           sequence<1, 2, 2>,
+                                           sequence<0, 0, 2>>{});
+        }
     }
 
     template <typename Problem>
@@ -80,7 +96,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
             BlockGemmProblem<typename Problem::QDataType,
                              typename Problem::KDataType,
                              typename Problem::SaccDataType,
-                             Problem::kBlockSize,
+                             Problem::kNumGemm0Warps * get_warp_size(),
                              TileGemmShape<sequence<Problem::BlockFmhaShape::kM0,
                                                     Problem::BlockFmhaShape::kN0,
                                                     Problem::BlockFmhaShape::kK0>,
@@ -129,12 +145,16 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
                                                  typename Problem::BlockFmhaShape::Gemm0BlockWarps,
                                                  decltype(warp_gemm)>;
 
-        return BlockGemmARegBSmemCRegV2<GemmProblem, BlockGemmPolicy>{};
+        if constexpr(1 < Problem::kNumGemm0Warps)
+            return BlockGemmARegBSmemCRegV2<GemmProblem, BlockGemmPolicy>{};
+        else
+            return BlockGemmARegBSmemCRegOneWarpV1<GemmProblem, BlockGemmPolicy>{};
     }
 };
 
+/// NOTICE: we no-longer use this policy.
 template <>
-struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
+struct [[deprecated]] BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
 {
     static constexpr bool QLoadOnce = false;
 
@@ -364,12 +384,15 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             constexpr index_t kNPerBlock   = Problem::BlockFmhaShape::kN1;
             constexpr index_t kKPerBlock   = Problem::BlockFmhaShape::kK1;
             constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
+            constexpr index_t kMaxVecLoad =
+                min(total_pixels, static_cast<index_t>(16 / sizeof(VDataType)));
+            constexpr index_t kMinVecLoad = 4 / sizeof(VDataType);
 
-            // TODO: not correct!
-            if constexpr(total_pixels > 4)
-                return 4;
-            else
-                return 2;
+            constexpr index_t kVecLoad = ((total_pixels / kMaxVecLoad) >= kMinVecLoad)
+                                             ? kMaxVecLoad
+                                             : (total_pixels / kMinVecLoad);
+
+            return kVecLoad;
         }
         else
         {
@@ -383,10 +406,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         using BlockGemm = remove_cvref_t<decltype(QXPolicy::template GetQKBlockGemm<Problem>())>;
         constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
         using WG              = remove_cvref_t<decltype(config.template at<0>())>;
-        using CWarpDstr       = typename WG::CWarpDstr;
-        constexpr auto vec =
-            CWarpDstr{}.get_ys_to_d_descriptor().get_lengths().at(number<CWarpDstr::NDimY - 1>{});
-        return vec;
+
+        return WG::WarpGemmAttribute::Impl::kCM1PerLane;
     }
 
     template <typename Problem>
@@ -395,10 +416,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         using BlockGemm       = remove_cvref_t<decltype(GetKVBlockGemm<Problem>())>;
         constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
         using WG              = remove_cvref_t<decltype(config.template at<0>())>;
-        using CWarpDstr       = typename WG::CWarpDstr;
-        constexpr auto vec =
-            CWarpDstr{}.get_ys_to_d_descriptor().get_lengths().at(number<CWarpDstr::NDimY - 1>{});
-        return vec;
+
+        return WG::WarpGemmAttribute::Impl::kCM1PerLane;
     }
 
     template <typename Problem>
@@ -449,44 +468,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         return max(SingleKSize, SingleVSize);
     }
 
-    template <typename Problem, typename BlockGemm>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeQRegBlockDescriptor()
-    {
-        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength;
-
-        constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
-
-        using WG = remove_cvref_t<decltype(config.template at<0>())>;
-
-        constexpr index_t MWarp = config.template at<1>();
-        constexpr index_t NWarp = config.template at<2>();
-
-        constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WG::kM);
-        constexpr index_t KIterPerWarp = kKPerBlock / WG::kK;
-
-        constexpr auto q_block_outer_dstr_encoding =
-            tile_distribution_encoding<sequence<NWarp>,
-                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
-                                       tuple<sequence<1, 0>>,
-                                       tuple<sequence<1, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 0>>{};
-
-        constexpr auto q_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            q_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
-
-        constexpr auto q_block_dstr = make_static_tile_distribution(q_block_dstr_encode);
-
-        return q_block_dstr;
-    }
-
     // TODO: this is used for non async copy desc. unify in the future
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeKLdsBlockDescriptor()
     {
         constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0;
         constexpr index_t kKPack     = GetSmemKPackK<Problem>();
 
         constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor(
@@ -886,36 +873,10 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         }
     }
 
-    template <typename Problem, typename BlockGemm>
+    template <typename BlockGemm>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBiasDramTileDistribution()
     {
-        constexpr index_t MPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t NPerBlock = Problem::BlockFmhaShape::kN0;
-
-        constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
-        using WG              = remove_cvref_t<decltype(config.template at<0>())>;
-
-        constexpr index_t MWarp = config.template at<1>();
-        constexpr index_t NWarp = config.template at<2>();
-
-        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
-        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
-
-        // Construct C-Block-HostTensor
-        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<1, 2>,
-            sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
-
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-
-        return c_block_dstr;
+        return BlockGemm::MakeCBlockTile().get_tile_distribution();
     }
 
     template <typename Problem>
@@ -972,7 +933,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             BlockGemmProblem<typename Problem::PDataType,
                              typename Problem::VDataType,
                              typename Problem::OaccDataType,
-                             Problem::kBlockSize,
+                             Problem::kNumGemm1Warps * get_warp_size(),
                              TileGemmShape<sequence<Problem::BlockFmhaShape::kM0,
                                                     Problem::BlockFmhaShape::kN1,
                                                     Problem::BlockFmhaShape::kK1>,
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index 64a61e94d..f2bb2200f 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -21,10 +21,15 @@ struct TileFmhaShape
     using Gemm1BlockWarps = remove_cvref_t<Gemm1BlockWarps_>;
     using Gemm1WarpTile   = remove_cvref_t<Gemm1WarpTile_>;
 
-    static constexpr index_t NumWarps =
+    static constexpr index_t NumGemm0Warps =
         reduce_on_sequence(Gemm0BlockWarps{}, multiplies{}, number<1>{});
+    static constexpr index_t NumGemm1Warps =
+        reduce_on_sequence(Gemm1BlockWarps{}, multiplies{}, number<1>{});
+    static_assert(NumGemm1Warps % NumGemm0Warps == 0);
+
+    static constexpr index_t NumWarps = max(NumGemm0Warps, NumGemm1Warps);
 
-    static_assert(NumWarps == reduce_on_sequence(Gemm1BlockWarps{}, multiplies{}, number<1>{}));
+    static_assert(std::is_same_v<Gemm0WarpTile, Gemm1WarpTile>);
 
     static constexpr index_t kM0 = BlockTile::at(number<0>{}); // tile size along q seqlen
     static constexpr index_t kN0 = BlockTile::at(number<1>{}); // tile size along k seqlen
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index e70825570..4ca773479 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp
index 9a5c2aae5..728a04d83 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp
@@ -157,7 +157,7 @@ struct BlockGemmARegBRegCRegV1
         });
     }
 
-    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
new file mode 100644
index 000000000..ff23f6355
--- /dev/null
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
+
+namespace ck_tile {
+
+// A is block distributed tensor
+// B is block window on shared memory
+// C is block distributed tensor
+template <typename Problem_, typename Policy_ = BlockGemmARegBSmemCRegV1DefaultPolicy>
+struct BlockGemmARegBSmemCRegOneWarpV1
+{
+    using Problem        = remove_cvref_t<Problem_>;
+    using Policy         = remove_cvref_t<Policy_>;
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+    static_assert(kBlockSize == get_warp_size(), "Check failed!");
+
+    // C += A * B
+    template <typename CBlockTensor, typename ABlockTensorTmp, typename BBlockWindowTmp>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ABlockTensorTmp& a_block_tensor_tmp,
+                                   const BBlockWindowTmp& b_block_window_tmp) const
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>> &&
+                std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+            "wrong!");
+
+        // constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
+        // constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        // constexpr index_t KPerBlock = ABlockTensorTmp{}.get_lengths()[number<1>{}];
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+        constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+        // static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN &&
+        //                   KPerBlock == BlockGemmShape::kK,
+        //               "wrong!");
+
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        static_assert(MWarp == 1 && NWarp == 1, "Check failed!");
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr index_t NPerBlockPerIter = NPerBlock / NIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+
+        const index_t iNWarp = 0;
+
+        constexpr auto a_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto c_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<MIterPerWarp>, sequence<NIterPerWarp>>,
+                                       tuple<>,
+                                       tuple<>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+
+        constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode);
+
+        // constrcut from A-block-tensor from A-Block-tensor-tmp
+        // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent
+        // distribution
+        auto a_block_tensor =
+            make_static_distributed_tensor<typename ABlockTensorTmp::DataType>(a_block_dstr);
+
+        a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer();
+
+        // construct B-warp-window
+        auto b_warp_window_tmp = make_tile_window(
+            b_block_window_tmp.get_bottom_tensor_view(),
+            make_tuple(number<WG::kN>{}, number<WG::kK>{}),
+            b_block_window_tmp.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0},
+            make_static_tile_distribution(typename WG::BWarpDstrEncoding{}));
+
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(b_warp_window_tmp), KIterPerWarp>, NIterPerWarp> b_warp_windows{
+            {b_warp_window_tmp}};
+
+        for(index_t nIter = 0; nIter < NIterPerWarp; nIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(b_warp_window_tmp), KIterPerWarp>,
+            NIterPerWarp>
+            b_warp_windows;
+
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        // check C-block-distribution
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
+                           remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
+            "wrong!");
+
+        using AWarpDstr = typename WG::AWarpDstr;
+        using CWarpDstr = typename WG::CWarpDstr;
+
+        using AWarpTensor = typename WG::AWarpTensor;
+        using CWarpTensor = typename WG::CWarpTensor;
+
+        constexpr auto a_warp_y_lengths =
+            to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+        constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // read A warp tensor from A block tensor
+                AWarpTensor a_warp_tensor;
+
+                a_warp_tensor.get_thread_buffer() = a_block_tensor.get_y_sliced_thread_data(
+                    merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
+                    merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter));
+
+                    // read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                    // warp GEMM
+                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                    // write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+    }
+
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        static_assert(MWarp == 1 && NWarp == 1, "Check failed!");
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        // constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr auto c_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<MIterPerWarp>, sequence<NIterPerWarp>>,
+                                       tuple<>,
+                                       tuple<>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+
+        static_assert(decltype(c_block_dstr_encode)::NDimP == 1, "Check failed!");
+
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    // C = A * B
+    template <typename ABlockTensorTmp, typename BBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp,
+                                   const BBlockWindowTmp& b_block_window_tmp) const
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        operator()(c_block_tensor, a_block_tensor_tmp, b_block_window_tmp);
+        return c_block_tensor;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
index beab457b9..98e5538c0 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
@@ -181,7 +181,7 @@ struct BlockGemmARegBSmemCRegV1
         });
     }
 
-    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
index 4a82702c1..173ef0a02 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
@@ -182,7 +182,7 @@ struct BlockGemmARegBSmemCRegV2
         });
     }
 
-    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp
index 3d142df4d..d28aa9e78 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp
@@ -180,7 +180,7 @@ struct BlockGemmASmemBRegCRegV1
         });
     }
 
-    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
index ac4522170..dc0b41135 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
@@ -167,7 +167,7 @@ struct BlockGemmASmemBSmemCRegV1
         });
     }
 
-    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 63c364331..a01265ad5 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -22,7 +22,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
 
     constexpr index_t idim_p_lane = NDimP - 1;
 
-    const auto ps_idx = make_array<index_t>(get_block_id(), get_lane_id());
+    const auto ps_idx = detail::get_partition_index(acc_tensor.get_tile_distribution());
     const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
 
     constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
-- 
GitLab


From 31bf253aeb93bb7e26336d4940c6f056d7c5f1b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Sat, 26 Oct 2024 15:22:37 +0200
Subject: [PATCH 019/153] Add dynamic elementwise op (#1426)

* Add dynamic elementwise op

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>

* CI issues fix

* Custom parameter value for dynamic functions - Comments addressed

---------

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>
Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>
---
 ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp |   6 +-
 example/62_convnd_activ/CMakeLists.txt        |   1 +
 .../dynamic_unary/CMakeLists.txt              |  45 +
 .../convnd_fwd_activ_dynamic_unary_common.hpp | 238 +++++
 .../convnd_fwd_xdl_dynamic_abs_fp16.cpp       |  13 +
 ...onvnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp |  13 +
 .../convnd_fwd_xdl_dynamic_elu_fp16.cpp       |  13 +
 .../convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp |  13 +
 .../convnd_fwd_xdl_dynamic_logistic_fp16.cpp  |  13 +
 ...onvnd_fwd_xdl_dynamic_passthrough_fp16.cpp |  13 +
 .../convnd_fwd_xdl_dynamic_pow_fp16.cpp       |  13 +
 .../convnd_fwd_xdl_dynamic_relu_fp16.cpp      |  13 +
 .../convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp   |  13 +
 .../convnd_fwd_xdl_dynamic_softrelu_fp16.cpp  |  13 +
 .../convnd_fwd_xdl_dynamic_swish_fp16.cpp     |  13 +
 .../convnd_fwd_xdl_dynamic_tanh_fp16.cpp      |  13 +
 .../run_convnd_activ_dynamic_example.inc      |  91 ++
 ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp |  19 +-
 .../gpu/element/element_wise_operation.hpp    |   8 +-
 .../element/unary_element_wise_operation.hpp  | 877 +++++++++++++++---
 ...ouped_conv_fwd_xdl_dynamic_op_instance.hpp | 179 ++++
 ...grouped_convolution_forward_dynamic_op.hpp | 278 ++++++
 .../CMakeLists.txt                            |   8 +
 ...mic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp |  55 ++
 ...amic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp |  55 ++
 ...amic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp |  55 ++
 ...mic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp |  54 ++
 .../CMakeLists.txt                            |   8 +
 ..._op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp |  55 ++
 ...c_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp |  55 ++
 ...c_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp |  55 ++
 ..._op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp |  54 ++
 32 files changed, 2188 insertions(+), 164 deletions(-)
 create mode 100644 example/62_convnd_activ/dynamic_unary/CMakeLists.txt
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
 create mode 100644 example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
 create mode 100644 example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp

diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index 36dcf58d7..ff1282f3c 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
@@ -60,14 +60,14 @@ struct AddAddRelu
     {
         const ck::half_t x = c + d0 + d1;
 
-        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }
     __host__ __device__ void
     operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
     {
         const float x = c + (d0 + d1);
 
-        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }
 };
 
diff --git a/example/62_convnd_activ/CMakeLists.txt b/example/62_convnd_activ/CMakeLists.txt
index ab136d99b..79fafed4e 100644
--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(convscale_add)
 add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)
+add_subdirectory(dynamic_unary)
 
 add_custom_target(example_convnd_activ_xdl)
 # ScaleAdd ScaleAdd Relu
diff --git a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
new file mode 100644
index 000000000..23f07439a
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
@@ -0,0 +1,45 @@
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+      add_custom_target(example_convnd_activ_dynamic_unary_xdl)
+      # Sigmoid
+      add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16)
+      # Tanh
+      add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16)
+      # Relu
+      add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16)
+      # SoftRelu
+      add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16)
+      # Abs
+      add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16)
+      # Pow
+      add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16)
+      # Clipped Relu
+      add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16)
+      # Leaky Relu
+      add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16)
+      # Elu
+      add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16)
+      # Swish
+      add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16)
+      # PassThrough
+      add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
+      # Logistic
+      add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)
+   
+   set(target 1)
+ endif()
+endforeach()
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
new file mode 100644
index 000000000..ed31be19e
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using InElementOp      = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
+using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceGroupedConvNDActivInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        DynamicElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
+{
+
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      std::array<const void*, 0>{},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp
new file mode 100644
index 000000000..8fa455c62
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::UnaryAbs out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp
new file mode 100644
index 000000000..239a21525
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::ClippedRelu out_element_op(0.f, 1.f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp
new file mode 100644
index 000000000..23a094af7
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Elu out_element_op(2.f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp
new file mode 100644
index 000000000..fe4b80a68
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::LeakyRelu out_element_op(0.f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp
new file mode 100644
index 000000000..756c07ed8
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Logistic out_element_op(1.0f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
new file mode 100644
index 000000000..6588ec504
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::PassThrough out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
new file mode 100644
index 000000000..90f00a166
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Power out_element_op(4.f, 1.f, 2.f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
new file mode 100644
index 000000000..830297cb5
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Relu out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
new file mode 100644
index 000000000..b143b4a4e
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Sigmoid out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
new file mode 100644
index 000000000..83ba0f7f8
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::SoftRelu out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
new file mode 100644
index 000000000..e862d1120
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Swish out_element_op(1.0f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
new file mode 100644
index 000000000..a91fc7ce3
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::TanH out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
new file mode 100644
index 000000000..4e90cf936
--- /dev/null
+++ b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+template <typename OutElementOp>
+bool run_convnd_example(int argc, char* argv[], const OutElementOp& out_element_op)
+{
+    print_helper_msg();
+
+    bool do_verification = true;
+    // Use floats for SoftRelu by default to avoid overflow after e^x.
+    int init_method =
+        std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
+    bool time_kernel = false;
+
+    // Following shapes are selected to avoid overflow. Expect inf in case of
+    // size increase for some elementwise ops.
+    ck::utils::conv::ConvParam conv_param{
+        3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+
+    const auto run = [&]() {
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        return run_grouped_conv<NDimSpatial,
+                                InDataType,
+                                WeiDataType,
+                                OutDataType,
+                                InElementOp,
+                                WeiElementOp,
+                                OutElementOp,
+                                DeviceGroupedConvNDActivInstance>(do_verification,
+                                                                  init_method,
+                                                                  time_kernel,
+                                                                  conv_param,
+                                                                  in_g_n_c_wis_desc,
+                                                                  wei_g_k_c_xs_desc,
+                                                                  out_g_n_k_wos_desc,
+                                                                  in_element_op,
+                                                                  wei_element_op,
+                                                                  out_element_op);
+    };
+
+    if(conv_param.num_dim_spatial_ == 3)
+    {
+        return run();
+    }
+
+    return false;
+}
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 939ee1729..f21a45938 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -85,9 +85,9 @@ __global__ void
             BsPointer p_bs_grid,
             DsPointer p_ds_grid,
             EDataType* __restrict__ p_e_grid,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CDEElementwiseOperation cde_element_op,
+            AElementwiseOperation a_element_op,
+            BElementwiseOperation b_element_op,
+            CDEElementwiseOperation cde_element_op,
             const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
             const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
             const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -121,6 +121,19 @@ __global__ void
     static_for<0, NumDTensor, 1>{}(
         [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });
 
+    if constexpr(is_same_v<AElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        a_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<BElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        b_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<CDEElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        cde_element_op.InitUnaryOpPtrOnDevice();
+    }
+
     if constexpr(isMultiA || isMultiB)
     {
         AsPointer p_as_grid_grp;
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 135eaec93..b914c0b96 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -405,7 +405,7 @@ struct ScaleAddScaleAddRelu
                                                                               const float& d1) const
     {
         const float x = c * alpha1_ + alpha2_ * d0 + d1;
-        Relu{}.template operator()<float>(e, x);
+        e             = x > 0 ? x : 0;
     }
 
     template <>
@@ -416,7 +416,7 @@ struct ScaleAddScaleAddRelu
                         type_convert<float>(d1);
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert<half_t>(result);
     }
@@ -429,7 +429,7 @@ struct ScaleAddScaleAddRelu
                         type_convert<float>(d1);
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert<bhalf_t>(result);
     }
@@ -441,7 +441,7 @@ struct ScaleAddScaleAddRelu
         const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert<int8_t>(result);
     }
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ab6b1691a..712b88618 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -7,11 +7,36 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include <cassert>
 
 namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+struct UnaryOpBase
+{
+    public:
+    __host__ __device__ virtual ~UnaryOpBase() = default;
+
+    __host__ __device__ UnaryOpBase()                   = default;
+    __host__ __device__ UnaryOpBase(const UnaryOpBase&) = default;
+    __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
+    __host__ __device__ UnaryOpBase(UnaryOpBase&&)                 = default;
+    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default;
+
+    __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(double& y, const double& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(int32_t& y, const int32_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(int8_t& y, const int8_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(half_t& y, const half_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(bhalf_t& y, const bhalf_t& x) const = 0;
+};
+
 struct PassThroughPack2
 {
     template <typename Y, typename X>
@@ -25,17 +50,24 @@ struct PassThroughPack2
     constexpr const static bool is_pack2_invocable = true;
 };
 
-struct PassThrough
+struct PassThrough : public UnaryOpBase
 {
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { y = x; }
+
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const;
 
-    template <>
-    __host__ __device__ void operator()<double, double>(double& y, const double& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()<float, double>(float& y, const double& x) const
     {
@@ -48,36 +80,12 @@ struct PassThrough
         y = type_convert<double>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()<half_t, float>(half_t& y, const float& x) const
     {
         y = type_convert<half_t>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
     {
@@ -102,12 +110,6 @@ struct PassThrough
         y = type_convert<float>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()<half_t, int8_t>(half_t& y, const int8_t& x) const
     {
@@ -407,20 +409,38 @@ struct UnarySquare
     };
 };
 
-struct UnaryAbs
+struct UnaryAbs : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
+        y = ck::math::abs(x);
+    }
 
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
         y = ck::math::abs(x);
-    };
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
 
-    template <>
     __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
     {
         y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
@@ -439,20 +459,34 @@ struct UnarySqrt
     };
 };
 
-struct Relu
+struct Relu : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
         y = x > 0 ? x : 0;
     }
 
-    template <>
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
     {
         float x_f32 = ck::type_convert<float>(x);
         float y_f32 = x_f32 > 0 ? x_f32 : 0;
@@ -599,18 +633,46 @@ struct Gelu
     }
 };
 
-struct Sigmoid
+struct Sigmoid : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
-                          is_same<T, int32_t>::value,
-                      "Data type is not supported by this operation!");
-        constexpr T one = type_convert<T>(1);
-        y               = one / (one + ck::math::exp(-x));
-    };
+        constexpr float one = type_convert<float>(1);
+        y                   = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        constexpr double one = type_convert<double>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        constexpr float one = type_convert<float>(1);
+        float x_f32         = ck::type_convert<float>(x);
+        float y_f32         = one / (one + ck::math::exp(x_f32));
+        y                   = ck::type_convert<bhalf_t>(y_f32);
+    }
 };
 
 struct Silu
@@ -626,18 +688,37 @@ struct Silu
     };
 };
 
-struct TanH
+struct TanH : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
-                          is_same<T, int32_t>::value,
-                      "Data type is not supported by this operation!");
+        y = ck::math::tanh(x);
+    }
 
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
         y = ck::math::tanh(x);
-    };
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
 };
 
 struct ACos
@@ -878,138 +959,393 @@ struct Rcp
     };
 };
 
-struct Swish
+struct Swish : public UnaryOpBase
 {
-    Swish(float beta = 1.0f) : beta_(beta) {}
+    __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {}
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
+    const float beta_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<float>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<double>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int32_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int8_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<half_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<bhalf_t>(x / (1.f + ck::math::exp(bx)));
+    }
 
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const
     {
         static_assert(is_same<X, float>::value || is_same<X, double>::value ||
-                          is_same<X, ck::half_t>::value,
+                          is_same<X, half_t>::value,
                       "Data type is not supported by this operation!");
 
         static_assert(is_same<Y, float>::value || is_same<Y, double>::value ||
-                          is_same<Y, ck::half_t>::value,
+                          is_same<Y, half_t>::value,
                       "Data type is not supported by this operation!");
 
         float bx = -beta_ * type_convert<float>(x);
         y        = type_convert<Y>(x / (1.f + ck::math::exp(bx)));
-    };
-
-    const float beta_;
+    }
 };
 
-struct SoftRelu
+struct SoftRelu : public UnaryOpBase
 {
-    SoftRelu(float alpha = 1.f) : alpha_(alpha){};
+    __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {}
 
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
-        y               = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+        float casted_alpha  = type_convert<float>(alpha_);
+        constexpr float one = type_convert<float>(1);
+        y                   = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
+        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
     }
-    const float alpha_;
 };
 
-struct Power
+struct Power : public UnaryOpBase
 {
-    Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
-        : alpha_(alpha), beta_(beta), gamma_(gamma){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
+        : alpha_(alpha), beta_(beta), gamma_(gamma)
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha     = type_convert<T>(alpha_);
-        T casted_beta      = type_convert<T>(beta_);
-        T casted_gamma     = type_convert<T>(gamma_);
-        T shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                  = ck::math::pow(shifted_scaled_x, casted_gamma);
     }
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
+    __host__ __device__ float get_gamma() const { return gamma_; }
+
     const float alpha_;
     const float beta_;
     const float gamma_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        float casted_gamma = type_convert<float>(gamma_);
+
+        float shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                      = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        double casted_gamma = type_convert<double>(gamma_);
+
+        double shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        int32_t casted_gamma = type_convert<int32_t>(gamma_);
+
+        int32_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        int8_t casted_gamma = type_convert<int8_t>(gamma_);
+
+        int8_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        half_t casted_gamma = type_convert<half_t>(gamma_);
+
+        half_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
+        bhalf_t casted_gamma = type_convert<bhalf_t>(gamma_);
+
+        bhalf_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
 };
 
-struct ClippedRelu
+struct ClippedRelu : public UnaryOpBase
 {
-    ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f)
+        : alpha_(alpha), beta_(beta)
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        T casted_beta  = type_convert<T>(beta_);
-        y              = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
     }
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
     const float alpha_;
     const float beta_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        y                  = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
+        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
 };
 
-struct LeakyRelu
+struct LeakyRelu : public UnaryOpBase
 {
-    LeakyRelu(float alpha = 0.01f) : alpha_(alpha){};
 
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        y                  = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()([[maybe_unused]] bhalf_t& y,
+                                               [[maybe_unused]] const bhalf_t& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        y              = x >= 0 ? x : x * casted_alpha;
     }
-    const float alpha_;
 };
 
-struct Elu
+struct Elu : public UnaryOpBase
 {
-    Elu(float alpha = 1.f) : alpha_(alpha){};
 
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        y              = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+        float casted_alpha = type_convert<float>(alpha_);
+        y                  = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
     }
-    const float alpha_;
 };
 
-struct Logistic
+struct Logistic : public UnaryOpBase
 {
-    Logistic(float alpha = 1.f) : alpha_(alpha){};
 
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
-        y               = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+        float casted_alpha  = type_convert<float>(alpha_);
+        constexpr float one = type_convert<float>(1);
+        y                   = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
+        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
     }
-    const float alpha_;
 };
 
 struct ConvInvscale
@@ -1074,7 +1410,7 @@ struct ConvScaleRelu
     __host__ __device__ void operator()<f8_t, float>(f8_t& e, const float& c) const
     {
         float x;
-        Relu{}.template operator()<float>(x, c * scale_in_ * scale_wei_);
+        Relu{}(x, c * scale_in_ * scale_wei_);
         e = type_convert<f8_t>(x * scale_out_);
     };
 
@@ -1153,6 +1489,239 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
     __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); }
 };
 
+struct DynamicUnaryOp
+{
+
+    DynamicUnaryOp& operator=(const DynamicUnaryOp& other)
+    {
+        if(this != &other)
+        {
+            unary_op_ptr_  = other.unary_op_ptr_;
+            unary_op_type_ = other.unary_op_type_;
+        }
+        return *this;
+    }
+
+    __host__ __device__ DynamicUnaryOp() = delete;
+
+    __host__ __device__ DynamicUnaryOp(const Swish& swish)
+    {
+        unary_op_type_ = UnaryOpType::Swish;
+        beta           = swish.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Swish&& swish)
+    {
+        unary_op_type_ = UnaryOpType::Swish;
+        beta           = swish.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+
+    __host__ __device__ DynamicUnaryOp(const PassThrough&)
+    {
+        unary_op_type_ = UnaryOpType::PassThrough;
+    }
+
+    __host__ __device__ DynamicUnaryOp(const PassThrough&&)
+    {
+        unary_op_type_ = UnaryOpType::PassThrough;
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Logistic& logistic)
+    {
+        unary_op_type_ = UnaryOpType::Logistic;
+        alpha          = logistic.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Logistic&& logistic)
+    {
+        unary_op_type_ = UnaryOpType::Logistic;
+        alpha          = logistic.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const TanH&) { unary_op_type_ = UnaryOpType::TanH; }
+
+    __host__ __device__ DynamicUnaryOp(const TanH&&) { unary_op_type_ = UnaryOpType::TanH; }
+
+    __host__ __device__ DynamicUnaryOp(const Relu&) { unary_op_type_ = UnaryOpType::Relu; }
+
+    __host__ __device__ DynamicUnaryOp(const Relu&&) { unary_op_type_ = UnaryOpType::Relu; }
+
+    __host__ __device__ DynamicUnaryOp(const SoftRelu& softrelu)
+    {
+        unary_op_type_ = UnaryOpType::SoftRelu;
+        alpha          = softrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const SoftRelu&& softrelu)
+    {
+        unary_op_type_ = UnaryOpType::SoftRelu;
+        alpha          = softrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+
+    __host__ __device__ DynamicUnaryOp(const Power& pow)
+    {
+        unary_op_type_ = UnaryOpType::Power;
+        alpha          = pow.get_alpha();
+        beta           = pow.get_beta();
+        gamma          = pow.get_gamma();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Power&& pow)
+    {
+        unary_op_type_ = UnaryOpType::Power;
+        alpha          = pow.get_alpha();
+        beta           = pow.get_beta();
+        gamma          = pow.get_gamma();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu& clippedrelu)
+    {
+        unary_op_type_ = UnaryOpType::ClippedRelu;
+        alpha          = clippedrelu.get_alpha();
+        beta           = clippedrelu.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu&& clippedrelu)
+    {
+        unary_op_type_ = UnaryOpType::ClippedRelu;
+        alpha          = clippedrelu.get_alpha();
+        beta           = clippedrelu.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu& leakyrelu)
+    {
+        unary_op_type_ = UnaryOpType::LeakyRelu;
+        alpha          = leakyrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu&& leakyrelu)
+    {
+        unary_op_type_ = UnaryOpType::LeakyRelu;
+        alpha          = leakyrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Elu& elu)
+    {
+        unary_op_type_ = UnaryOpType::Elu;
+        alpha          = elu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Elu&& elu)
+    {
+        unary_op_type_ = UnaryOpType::Elu;
+        alpha          = elu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op)
+        : unary_op_type_(dynamic_op.unary_op_type_),
+          unary_op_ptr_(dynamic_op.unary_op_ptr_),
+          alpha(dynamic_op.alpha),
+          beta(dynamic_op.beta),
+          gamma(dynamic_op.gamma)
+    {
+    }
+
+    __host__ __device__ ~DynamicUnaryOp()
+    {
+        if(unary_op_ptr_)
+            delete unary_op_ptr_;
+    }
+
+    __device__ void InitUnaryOpPtrOnDevice()
+    {
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): unary_op_ptr_ = new Swish(beta); break;
+        case(UnaryOpType::Sigmoid): unary_op_ptr_ = new Sigmoid; break;
+        case(UnaryOpType::PassThrough): unary_op_ptr_ = new PassThrough; break;
+        case(UnaryOpType::Logistic): unary_op_ptr_ = new Logistic(alpha); break;
+        case(UnaryOpType::TanH): unary_op_ptr_ = new TanH; break;
+        case(UnaryOpType::Relu): unary_op_ptr_ = new Relu; break;
+        case(UnaryOpType::SoftRelu): unary_op_ptr_ = new SoftRelu(alpha); break;
+        case(UnaryOpType::UnaryAbs): unary_op_ptr_ = new UnaryAbs; break;
+        case(UnaryOpType::Power): unary_op_ptr_ = new Power(alpha, beta, gamma); break;
+        case(UnaryOpType::ClippedRelu): unary_op_ptr_ = new ClippedRelu(alpha, beta); break;
+        case(UnaryOpType::LeakyRelu): unary_op_ptr_ = new LeakyRelu(alpha); break;
+        case(UnaryOpType::Elu): unary_op_ptr_ = new Elu(alpha); break;
+
+        default: unary_op_ptr_ = nullptr; break;
+        }
+    }
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<X, Y>();
+        unary_op_ptr_->operator()(y, x);
+    }
+
+    template <typename Y, typename X>
+    __host__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<X, Y>();
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): Swish{}.operator()(y, x); break;
+        case(UnaryOpType::Sigmoid): Sigmoid{}.operator()(y, x); break;
+        case(UnaryOpType::PassThrough): PassThrough{}.operator()(y, x); break;
+        case(UnaryOpType::Logistic): Logistic{}.operator()(y, x); break;
+        case(UnaryOpType::TanH): TanH{}.operator()(y, x); break;
+        case(UnaryOpType::Relu): Relu{}.operator()(y, x); break;
+        case(UnaryOpType::SoftRelu): SoftRelu{}.operator()(y, x); break;
+        case(UnaryOpType::UnaryAbs): UnaryAbs{}.operator()(y, x); break;
+        case(UnaryOpType::Power): Power{}.operator()(y, x); break;
+        case(UnaryOpType::ClippedRelu): ClippedRelu{}.operator()(y, x); break;
+        case(UnaryOpType::LeakyRelu): LeakyRelu{}.operator()(y, x); break;
+        case(UnaryOpType::Elu): Elu{}.operator()(y, x); break;
+        default: break;
+        }
+    }
+
+    template <typename X, typename Y>
+    __device__ __host__ constexpr void isSupported() const
+    {
+
+        static_assert(std::is_same<X, Y>::value, "X and Y must be of the same type");
+
+        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
+                          is_same<X, bhalf_t>::value || is_same<X, half_t>::value ||
+                          is_same<X, int32_t>::value || is_same<X, int8_t>::value,
+                      "Data type is not supported by this operation!");
+    }
+
+    private:
+    enum class UnaryOpType
+    {
+        Swish,
+        Sigmoid,
+        PassThrough,
+        Logistic,
+        TanH,
+        Relu,
+        SoftRelu,
+        UnaryAbs,
+        Power,
+        ClippedRelu,
+        LeakyRelu,
+        Elu
+    };
+
+    public:
+    UnaryOpType unary_op_type_;
+    UnaryOpBase* unary_op_ptr_ = nullptr;
+    float alpha;
+    float beta;
+    float gamma;
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
new file mode 100644
index 000000000..9db675a51
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using namespace ck::tensor_layout::convolution;
+
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 = ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 = ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
new file mode 100644
index 000000000..5efee69b2
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dynamic.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+#ifdef CK_ENABLE_BF16
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances);
+#endif
+
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DLayouts,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DDataTypes,
+          typename OutDataType,
+          typename ComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DLayouts,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DDataTypes,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::DynamicUnaryOp,
+    ComputeType>>
+{
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        DLayouts,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        DDataTypes,
+                                        OutDataType,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::DynamicUnaryOp,
+                                        ComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK> &&
+                     DLayouts::Size() == 0)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                          is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK> &&
+                          DLayouts::Size() == 0)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt
new file mode 100644
index 000000000..92735fcae
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt
@@ -0,0 +1,8 @@
+# ONLY XDL_KERNELS
+set(GROUPED_CONV2D_FWD_DYNAMIC_OP
+   xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp)
+
+add_instance_library(device_grouped_conv2d_fwd_dynamic_op_instance ${GROUPED_CONV2D_FWD_DYNAMIC_OP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..853470e1c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
new file mode 100644
index 000000000..725b9ca0d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
new file mode 100644
index 000000000..fbd5fe370
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2,
+                                                             NHWGC,
+                                                             GKYXC,
+                                                             Tuple<>,
+                                                             NHWGK,
+                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
new file mode 100644
index 000000000..6bfc29537
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                ck::Tuple<>,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Tuple<>,
+                                                              NHWGK,
+                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt
new file mode 100644
index 000000000..3b8ebbffd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt
@@ -0,0 +1,8 @@
+# ONLY XDL_KERNELS
+set(GROUPED_CONV3D_FWD_DYNAMIC_OP
+   xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp)
+
+add_instance_library(device_grouped_conv3d_fwd_dynamic_op_instance ${GROUPED_CONV3D_FWD_DYNAMIC_OP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
new file mode 100644
index 000000000..249dfaa4d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
new file mode 100644
index 000000000..75c4ddc35
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                ck::Tuple<>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
new file mode 100644
index 000000000..2e237e07b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                ck::Tuple<>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3,
+                                                             NDHWGC,
+                                                             GKZYXC,
+                                                             Tuple<>,
+                                                             NDHWGK,
+                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
new file mode 100644
index 000000000..e38f1acbd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                ck::Tuple<>,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Tuple<>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
-- 
GitLab


From b098b71b05e4c06310f2e74056282a796f3cfd13 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Sat, 26 Oct 2024 23:52:49 +0800
Subject: [PATCH 020/153] topk_softmax (#1592)

* topk_softmax

* remove some file

* fix atomix linear_offset

* address various comment, and change sfc get_index api to static(tuple)
---
 .../ck_tile/09_topk_softmax/CMakeLists.txt    |    8 +
 example/ck_tile/09_topk_softmax/README.md     |   28 +
 .../09_topk_softmax/script/smoke_test.sh      |   22 +
 .../ck_tile/09_topk_softmax/topk_softmax.cpp  |  299 +++++
 .../09_topk_softmax/topk_softmax_api.cpp      |   96 ++
 .../09_topk_softmax/topk_softmax_api.hpp      |   21 +
 example/ck_tile/CMakeLists.txt                |    2 +
 include/ck_tile/core.hpp                      |    1 +
 .../core/algorithm/space_filling_curve.hpp    |   12 +-
 .../core/arch/amd_buffer_addressing.hpp       |  213 ++-
 include/ck_tile/core/config.hpp               |   18 +
 include/ck_tile/core/container/tuple.hpp      |   34 +-
 include/ck_tile/core/numeric/math.hpp         |  972 +++++++++++++-
 include/ck_tile/core/tensor/buffer_view.hpp   |  178 ++-
 include/ck_tile/core/tensor/load_tile.hpp     |   60 +-
 include/ck_tile/core/tensor/shuffle_tile.hpp  |    2 +-
 include/ck_tile/core/tensor/store_tile.hpp    |   31 +-
 include/ck_tile/core/tensor/tensor_view.hpp   |  217 ++-
 include/ck_tile/core/tensor/tile_window.hpp   |  210 ++-
 .../core/tensor/tile_window_linear.hpp        | 1082 +++++++++++++++
 include/ck_tile/core/utility/magic_div.hpp    |   27 +-
 include/ck_tile/host.hpp                      |    1 +
 include/ck_tile/host/fill.hpp                 |   68 +
 include/ck_tile/host/host_tensor.hpp          |   23 +
 .../host/reference/reference_softmax.hpp      |   80 +-
 .../ck_tile/host/reference/reference_topk.hpp |  124 ++
 include/ck_tile/ops/elementwise.hpp           |    7 +
 .../unary_element_wise_operation.hpp          | 1163 +++++++++++++++++
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    |    4 +-
 .../ck_tile/ops/reduce/block/block_reduce.hpp |  170 +++
 include/ck_tile/ops/softmax.hpp               |    8 +
 .../ops/softmax/block/block_softmax_2d.hpp    |   81 ++
 .../block/block_softmax_2d_problem.hpp        |   16 +
 include/ck_tile/ops/topk.hpp                  |    8 +
 .../ops/topk/block/block_topk_stream_2d.hpp   |  113 ++
 .../block/block_topk_stream_2d_problem.hpp    |   22 +
 include/ck_tile/ops/topk_softmax.hpp          |   10 +
 .../kernel/topk_softmax_kernel.hpp            |  166 +++
 .../topk_softmax_warp_per_row_pipeline.hpp    |  123 ++
 .../topk_softmax_warp_per_row_policy.hpp      |   63 +
 .../topk_softmax_warp_per_row_problem.hpp     |   46 +
 41 files changed, 5603 insertions(+), 226 deletions(-)
 create mode 100644 example/ck_tile/09_topk_softmax/CMakeLists.txt
 create mode 100644 example/ck_tile/09_topk_softmax/README.md
 create mode 100644 example/ck_tile/09_topk_softmax/script/smoke_test.sh
 create mode 100644 example/ck_tile/09_topk_softmax/topk_softmax.cpp
 create mode 100644 example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
 create mode 100644 example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
 create mode 100644 include/ck_tile/core/tensor/tile_window_linear.hpp
 create mode 100644 include/ck_tile/host/reference/reference_topk.hpp
 create mode 100644 include/ck_tile/ops/elementwise.hpp
 create mode 100644 include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
 create mode 100644 include/ck_tile/ops/softmax.hpp
 create mode 100644 include/ck_tile/ops/softmax/block/block_softmax_2d.hpp
 create mode 100644 include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp
 create mode 100644 include/ck_tile/ops/topk.hpp
 create mode 100644 include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp
 create mode 100644 include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp
 create mode 100644 include/ck_tile/ops/topk_softmax.hpp
 create mode 100644 include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
 create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
 create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp
 create mode 100644 include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp

diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt
new file mode 100644
index 000000000..b43b98979
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp)
+target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS)
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(tile_example_topk_softmax PRIVATE ${EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS})
diff --git a/example/ck_tile/09_topk_softmax/README.md b/example/ck_tile/09_topk_softmax/README.md
new file mode 100644
index 000000000..104301290
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/README.md
@@ -0,0 +1,28 @@
+# topk-softmax
+
+This folder contains example for topk-softmax kernel using ck_tile tile-programming implementation. This kernel is often used in Moe model, before launching the fused-moe-gemm block. The input is a `token*expert` 2d matrix. The op will do a softmax per row(`expert`), then find the `topk` value for each row. Output is a `token*topk`  weight(usually fp32) and index(int32) 2d tensor.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_topk_softmax -j
+```
+This will result in an executable `build/bin/tile_example_topk_softmax`
+
+## example
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -pr_i    input data type. fp16/fp32 (representing 8/16/32 bit data) (default:fp16)
+       -pr_w    output weight data type(currently only fp32 supported now) (default:fp32)
+          -t    number of input tokens (default:32)
+          -e    number of experts (default:8)
+          -k    topk (default:2)
+       -st_i    row stride of input, -1 means same as experts (default:-1)
+       -st_o    row stride of output/indices, -1 means same as topk (default:-1)
+       -seed    seed to be used, -1 means random every time (default:-1)
+      -kname    when set to 1 it will print kernel name (default:0)
+
+```
diff --git a/example/ck_tile/09_topk_softmax/script/smoke_test.sh b/example/ck_tile/09_topk_softmax/script/smoke_test.sh
new file mode 100644
index 000000000..646f5889f
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/script/smoke_test.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+EXE=./build/bin/tile_example_topk_softmax
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -pr_i=$pr_i -t=80 -e=17
+$EXE -pr_i=$pr_i -t=111 -e=117
+$EXE -pr_i=$pr_i -t=1000 -e=55
+$EXE -pr_i=$pr_i -t=99 -e=180
+$EXE -pr_i=$pr_i -t=175 -e=64 -k=8
+$EXE -pr_i=$pr_i -t=65 -e=8 -k=2
+$EXE -pr_i=$pr_i -t=1 -e=25
+$EXE -pr_i=$pr_i -t=31 -e=19 -k=15
+$EXE -pr_i=$pr_i -t=81 -e=37 -k=7
+$EXE -pr_i=$pr_i -t=199 -e=128 -k=13
+$EXE -pr_i=$pr_i -t=23 -e=1 -k=1
+$EXE -pr_i=$pr_i -t=127 -e=99 -k=19 -st_i=233 -st_o=31
+$EXE -pr_i=$pr_i -t=71 -e=11 -k=11 -st_i=30 -st_o=12
+$EXE -pr_i=$pr_i -t=1 -e=1 -k=1
+$EXE -pr_i=$pr_i -t=99 -e=2 -k=1 -st_i=11 -st_o=5
+$EXE -pr_i=$pr_i -t=333 -e=99 -k=13 -st_i=191 -st_o=17
+done
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax.cpp b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
new file mode 100644
index 000000000..6fc25631f
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "topk_softmax_api.hpp"
+
+#if 0
+template <typename T>
+void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
+{
+    auto len = x.get_lengths();
+    assert(len.size() == 2);
+    std::cout << "[";
+    for(size_t i = 0; i < len[0]; i++)
+    {
+        std::cout << i << ": [";
+        for(size_t j = 0; j < len[1]; j++)
+        {
+            if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
+            {
+                auto v = ck_tile::type_convert<float>(x(i, j));
+
+                std::cout << v;
+                if(j != len[1] - 1)
+                    std::cout << ",";
+            }
+            else
+            {
+                std::cout << x(i, j) << " ";
+            }
+        }
+        std::cout << "]";
+        if(i != len[0] - 1)
+            std::cout << ",";
+        else
+            std::cout << "]";
+        std::cout << std::endl;
+    }
+    std::cout << "--------------------" << std::endl;
+}
+#endif
+
+// CPU reference
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+auto reference_topk_softmax(const ck_tile::HostTensor<InputType>& x,
+                            ck_tile::index_t k,
+                            ck_tile::index_t dim = -1,
+                            bool largest         = true,
+                            bool sorted          = true)
+{
+    using namespace ck_tile;
+
+    auto y = reference_softmax<InputType, WeightType, WeightType>(x, dim);
+
+    auto [y_values, y_indices] = reference_topk(y, k, dim, largest, sorted);
+
+    return ck_tile::make_tuple(y_values, y_indices);
+}
+
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+auto reference_topk_softmax(const ck_tile::HostTensor<InputType>& x,
+                            ck_tile::HostTensor<WeightType>& y_values,
+                            ck_tile::HostTensor<IndexType>& y_indices,
+                            ck_tile::index_t k,
+                            ck_tile::index_t dim = -1,
+                            bool largest         = true,
+                            bool sorted          = true)
+{
+    using namespace ck_tile;
+
+    auto y = reference_softmax<InputType, WeightType, WeightType>(x, dim);
+    reference_topk(y, y_values, y_indices, k, dim, largest, sorted);
+}
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit(std::string /*init_method*/)
+{
+    double rtol = 1e-3;
+    double atol = 1e-3;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+{
+    if(init_method == "ui" || init_method == "ni")
+    {
+        unsigned max_rounding_point_distance = 0;
+        double atol                          = 2e-3;
+        return ck_tile::make_tuple(max_rounding_point_distance, atol);
+    }
+    else
+    {
+        unsigned max_rounding_point_distance = 1;
+        double atol                          = 0.0625;
+        return ck_tile::make_tuple(max_rounding_point_distance, atol);
+    }
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "weather do CPU validation or not")
+        .insert("pr_i", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)")
+        .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
+        .insert("t", "32", "number of input tokens")
+        .insert("e", "8", "number of experts")
+        .insert("k", "2", "topk")
+        .insert("st_i", "-1", "row stride of input, -1 means same as experts")
+        .insert("st_o", "-1", "row stride of output/indices, -1 means same as topk")
+        .insert("seed", "-1", "seed to be used, -1 means random every time")
+        .insert("kname", "0", "when set to 1 it will print kernel name")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+bool test_topk_softmax(ck_tile::ArgParser args)
+{
+    int validate            = args.get_int("v");
+    std::string input_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+    int tokens              = args.get_int("t");
+    int experts             = args.get_int("e");
+    int topk                = args.get_int("k");
+    int seed                = args.get_int("seed");
+    int stride_input        = args.get_int("st_i");
+    int stride_output       = args.get_int("st_o");
+    int kname               = args.get_int("kname");
+    int warmup              = args.get_int("warmup");
+    int repeat              = args.get_int("repeat");
+
+    if(stride_input < 0)
+    {
+        stride_input = experts;
+    }
+    if(stride_output < 0)
+    {
+        stride_output = topk;
+    }
+    assert(stride_input >= experts);
+    assert(stride_output >= topk);
+
+    if(seed < 0)
+    {
+        seed = std::time(nullptr);
+    }
+
+    if(topk > experts)
+    {
+        printf("topk:%d value should be smaller than, or equal to number of experts:%d\n",
+               topk,
+               experts);
+        return false;
+    }
+
+    // tokens already considered batch size
+    ck_tile::HostTensor<InputType> x_host({tokens, experts}, {stride_input, 1});
+    ck_tile::HostTensor<WeightType> value_host({tokens, topk}, {stride_output, 1});
+    ck_tile::HostTensor<IndexType> index_host({tokens, topk}, {stride_output, 1});
+
+    {
+        // random require per-row unique
+        auto rand_gen = ck_tile::FillUniformDistribution_Unique<InputType>{
+            -5.f, 5.f, static_cast<uint32_t>(seed)};
+
+        for(int i_t = 0; i_t < tokens; i_t++)
+        {
+            ck_tile::HostTensor<InputType> x_row({experts});
+            rand_gen(x_row);
+            std::copy(x_row.begin(), x_row.end(), x_host.begin() + i_t * stride_input);
+            rand_gen.clear();
+        }
+    }
+
+    ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem value_dev(value_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem index_dev(index_host.get_element_space_size_in_bytes());
+
+    x_dev.ToDevice(x_host.data());
+
+    topk_softmax_trait trait{input_prec, weight_prec, experts};
+
+    topk_softmax_kargs karg{x_dev.GetDeviceBuffer(),
+                            value_dev.GetDeviceBuffer(),
+                            index_dev.GetDeviceBuffer(),
+                            tokens,
+                            experts,
+                            topk,
+                            stride_input,
+                            stride_output};
+
+    ck_tile::stream_config sc{nullptr,
+                              true,
+                              /* log_level = */ (kname ? 1 : 0),
+                              warmup,
+                              repeat};
+    auto ms = topk_softmax(trait, karg, sc);
+    printf("[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, st_o:%d, ms:%f, ",
+           input_prec.c_str(),
+           weight_prec.c_str(),
+           tokens,
+           experts,
+           topk,
+           stride_input,
+           stride_output,
+           ms);
+    if(ms < 0)
+        printf("not supported\n");
+    fflush(stdout);
+    if(ms < 0)
+    {
+        return false;
+    }
+
+    value_dev.FromDevice(value_host.data());
+    index_dev.FromDevice(index_host.data());
+
+    bool rtn = true;
+    if(validate)
+    {
+        ck_tile::HostTensor<WeightType> value_ref({tokens, topk}, {stride_output, 1});
+        ck_tile::HostTensor<IndexType> index_ref({tokens, topk}, {stride_output, 1});
+
+        reference_topk_softmax<InputType, WeightType, IndexType>(
+            x_host, value_ref, index_ref, topk);
+
+        auto [rtol, atol] = get_elimit<InputType>("");
+        for(int i_t = 0; i_t < tokens; i_t++)
+        {
+            auto s_begin = std::vector<size_t>{static_cast<size_t>(i_t), static_cast<size_t>(0)};
+            auto s_end =
+                std::vector<size_t>{static_cast<size_t>(i_t + 1), static_cast<size_t>(topk)};
+            auto s_value_host = value_host.slice(s_begin, s_end);
+            auto s_value_ref  = value_ref.slice(s_begin, s_end);
+            rtn &= ck_tile::check_err(s_value_host,
+                                      s_value_ref,
+                                      std::string("[") + std::to_string(i_t) +
+                                          std::string("] Value Error:"),
+                                      rtol,
+                                      atol);
+            auto s_index_host = index_host.slice(s_begin, s_end);
+            auto s_index_ref  = index_ref.slice(s_begin, s_end);
+            rtn &= ck_tile::check_err(s_index_host,
+                                      s_index_ref,
+                                      std::string("[") + std::to_string(i_t) +
+                                          std::string("] Index Error:"),
+                                      rtol,
+                                      atol);
+        }
+    }
+
+    printf("valid:%s\n", rtn ? "y" : "n");
+    fflush(stdout);
+    return rtn;
+}
+
+int main(int argc, char** argv)
+{
+    auto [result, args] = create_args(argc, argv);
+    if(!result)
+        return -1;
+    std::string input_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+
+    bool r = true;
+    if(input_prec.compare("fp16") == 0 && weight_prec.compare("fp32") == 0)
+    {
+        r &= test_topk_softmax<ck_tile::fp16_t, float, ck_tile::index_t>(args);
+    }
+    else if(input_prec.compare("bf16") == 0 && weight_prec.compare("fp32") == 0)
+    {
+        r &= test_topk_softmax<ck_tile::bf16_t, float, ck_tile::index_t>(args);
+    }
+
+    return r ? 0 : -1;
+}
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
new file mode 100644
index 000000000..249a307b8
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "topk_softmax_api.hpp"
+
+#define TOPK_SOFTMAX_DISPATCH(experts_)                                                         \
+    constexpr ck_tile::index_t ts_experts = experts_;                                           \
+    using ts_problem                      = ck_tile::                                           \
+        TopkSoftmaxWarpPerRowProblem<ts_input_type, ts_weight_type, ts_index_type, ts_experts>; \
+    using ts_pipeline = ck_tile::TopkSoftmaxWarpPerRowPipeline<ts_problem>;                     \
+                                                                                                \
+    using kernel = ck_tile::TopkSoftmaxKernel<ts_pipeline>;                                     \
+                                                                                                \
+    auto kargs = kernel::MakeKargs(a);                                                          \
+                                                                                                \
+    const dim3 grids      = kernel::GridSize(a);                                                \
+    constexpr dim3 blocks = kernel::BlockSize();                                                \
+                                                                                                \
+    float ave_time = ck_tile::launch_kernel(                                                    \
+        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));               \
+                                                                                                \
+    return ave_time;
+
+float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s)
+{
+    if(t.input_type == "fp16" && t.weight_type == "fp32")
+    {
+        using ts_input_type  = ck_tile::fp16_t;
+        using ts_weight_type = float;
+        using ts_index_type  = ck_tile::index_t;
+#if 1
+        if(t.experts <= 8)
+        {
+            TOPK_SOFTMAX_DISPATCH(8)
+        }
+        else if(t.experts <= 16)
+        {
+            TOPK_SOFTMAX_DISPATCH(16)
+        }
+        else if(t.experts <= 32)
+        {
+            TOPK_SOFTMAX_DISPATCH(32)
+        }
+        else if(t.experts <= 64)
+        {
+            TOPK_SOFTMAX_DISPATCH(64)
+        }
+        else if(t.experts <= 128)
+        {
+            TOPK_SOFTMAX_DISPATCH(128)
+        }
+        else if(t.experts <= 192)
+        {
+            TOPK_SOFTMAX_DISPATCH(192)
+        }
+#else
+        if(t.experts <= 128)
+        {
+            TOPK_SOFTMAX_DISPATCH(128)
+        }
+#endif
+    }
+    else if(t.input_type == "bf16" && t.weight_type == "fp32")
+    {
+#if 1
+        using ts_input_type  = ck_tile::bf16_t;
+        using ts_weight_type = float;
+        using ts_index_type  = ck_tile::index_t;
+        if(t.experts <= 8)
+        {
+            TOPK_SOFTMAX_DISPATCH(8)
+        }
+        else if(t.experts <= 16)
+        {
+            TOPK_SOFTMAX_DISPATCH(16)
+        }
+        else if(t.experts <= 32)
+        {
+            TOPK_SOFTMAX_DISPATCH(32)
+        }
+        else if(t.experts <= 64)
+        {
+            TOPK_SOFTMAX_DISPATCH(64)
+        }
+        else if(t.experts <= 128)
+        {
+            TOPK_SOFTMAX_DISPATCH(128)
+        }
+        else if(t.experts <= 192)
+        {
+            TOPK_SOFTMAX_DISPATCH(192)
+        }
+#endif
+    }
+    return -1;
+}
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
new file mode 100644
index 000000000..65651efa4
--- /dev/null
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/topk_softmax.hpp"
+#include <string>
+
+struct topk_softmax_trait
+{
+    std::string input_type;
+    std::string weight_type; // currently always float
+    int experts;
+};
+
+struct topk_softmax_kargs : public ck_tile::TopkSoftmaxHostArgs
+{
+};
+
+float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s);
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index ec4a175d3..366fb18a0 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -7,3 +7,5 @@ add_subdirectory(02_layernorm2d)
 add_subdirectory(03_gemm)
 add_subdirectory(04_img2col)
 add_subdirectory(05_reduce)
+add_subdirectory(09_topk_softmax)
+
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index d96f14710..56dfbd636 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -49,6 +49,7 @@
 #include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
 #include "ck_tile/core/tensor/tile_elementwise.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/tensor/update_tile.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
diff --git a/include/ck_tile/core/algorithm/space_filling_curve.hpp b/include/ck_tile/core/algorithm/space_filling_curve.hpp
index 77a635611..6591acddb 100644
--- a/include/ck_tile/core/algorithm/space_filling_curve.hpp
+++ b/include/ck_tile/core/algorithm/space_filling_curve.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -81,8 +81,10 @@ struct space_filling_curve
         return get_step_between(number<AccessIdx1d>{}, number<AccessIdx1d - 1>{});
     }
 
+    // Do not use this function directly!
+    // TODO: can refactor into generic lambda in the future
     template <index_t AccessIdx1d>
-    static CK_TILE_HOST_DEVICE constexpr Index get_index(number<AccessIdx1d>)
+    static CK_TILE_HOST_DEVICE constexpr Index _get_index(number<AccessIdx1d>)
     {
 #if 0
         /*
@@ -153,11 +155,11 @@ struct space_filling_curve
         return idx_md;
     }
 
-    // FIXME: rename this function
+    // FIXME: return tuple of number<>, which is compile time only variable
     template <index_t AccessIdx1d>
-    static CK_TILE_HOST_DEVICE constexpr auto get_index_tuple_of_number(number<AccessIdx1d>)
+    static CK_TILE_HOST_DEVICE constexpr auto get_index(number<AccessIdx1d>)
     {
-        constexpr auto idx = get_index(number<AccessIdx1d>{});
+        constexpr auto idx = _get_index(number<AccessIdx1d>{});
 
         return generate_tuple([&](auto i) { return number<idx[i]>{}; }, number<nDim>{});
     }
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 7f488d1b7..3feede4d2 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -621,6 +621,99 @@ CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
     asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }
 
+namespace impl {
+// below type indicate the data type used for buffer load inline asm
+// clang-format off
+template<index_t N, typename T> struct smem_load_trait;
+
+template<typename T> struct smem_load_trait<16, T> { using payload_t = fp32x4_t; };
+template<typename T> struct smem_load_trait<8 , T> { using payload_t = fp32x2_t; };
+template<typename T> struct smem_load_trait<4 , T> { using payload_t = float; };
+template<typename T> struct smem_load_trait<2 , T> { using payload_t = float; };
+template<typename T> struct smem_load_trait<1 , T> { using payload_t = float; };
+
+// clang-format on
+} // namespace impl
+
+// NOTE: smem load/store no need pre_nop to make sure dependency by sw, happy :)
+template <index_t>
+struct smem_load;
+
+template <>
+struct smem_load<16>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 16);
+        using mbuf_t = typename impl::smem_load_trait<16, T>::payload_t;
+        asm volatile("ds_read_b128 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<8>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 8);
+        using mbuf_t = typename impl::smem_load_trait<8, T>::payload_t;
+        asm volatile("ds_read_b64 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<4>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::smem_load_trait<4, T>::payload_t;
+        asm volatile("ds_read_b32 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<2>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
+        using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
+        asm volatile("ds_read_u16 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<1>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
+        asm volatile("ds_read_u8 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
 // clang-format off
 namespace impl{
 
@@ -976,6 +1069,16 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                        int soffset,    // dst_wave_addr_offset
                                        int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
 
+// Direct loads from global to LDS.
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
+                                __attribute__((address_space(3))) uint32_t* lds_ptr,
+                                index_t size,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t offset,
+                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
+
 template <bool pre_nop = false>
 CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
                                               int32x4_t rsrc,
@@ -1313,6 +1416,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
                                              int32x4_t src_wave_buffer_resource,
                                              index_t src_thread_addr_offset,
                                              index_t src_wave_addr_offset,
+                                             index_t src_linear_addr_offset,
                                              index_t flag           = 0,
                                              bool_constant<pre_nop> = {})
 {
@@ -1327,7 +1431,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
                                                 src_wave_buffer_resource,
                                                 src_thread_addr_offset,
                                                 src_wave_addr_offset,
-                                                0,
+                                                src_linear_addr_offset,
                                                 flag,
                                                 bool_constant<pre_nop>{});
     }
@@ -1337,7 +1441,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
                                              src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset,
-                                             0,
+                                             src_linear_addr_offset,
                                              flag,
                                              bool_constant<pre_nop>{});
     }
@@ -1365,6 +1469,43 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
                               bool_constant<pre_nop>{});
 }
 
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
+                                          int32x4_t src_wave_buffer_resource,
+                                          index_t src_thread_addr_offset,
+                                          index_t src_wave_addr_offset,
+                                          index_t src_immediate_addr_offset    = 0,
+                                          index_t flag                         = 0,
+                                          bool_constant<oob_conditional_check> = {})
+{
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
+
+    if constexpr(oob_conditional_check)
+    {
+        index_t v_offset = flag ? v_offset : src_wave_buffer_resource[2];
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        v_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+    else
+    {
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        src_thread_addr_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+}
+
 template <index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
 CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
@@ -1685,6 +1826,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer<T, N>& dst_thr
                                               int32x4_t dst_wave_buffer_resource,
                                               index_t dst_thread_addr_offset,
                                               index_t dst_wave_addr_offset,
+                                              index_t dst_linear_addr_offset,
                                               index_t is_valid_element = 1)
 {
     constexpr index_t bytes = sizeof(T) * N;
@@ -1698,7 +1840,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer<T, N>& dst_thr
                                         dst_wave_buffer_resource,
                                         dst_thread_addr_offset,
                                         dst_wave_addr_offset,
-                                        0,
+                                        dst_linear_addr_offset,
                                         is_valid_element);
     }
     else
@@ -1707,7 +1849,7 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer<T, N>& dst_thr
                                      dst_wave_buffer_resource,
                                      dst_thread_addr_offset,
                                      dst_wave_addr_offset,
-                                     0);
+                                     dst_linear_addr_offset);
     }
 }
 
@@ -2014,6 +2156,7 @@ template <typename T,
 CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
                                         const T* p_src_wave,
                                         index_t src_thread_element_offset,
+                                        index_t src_linear_element_offset,
                                         index_t src_element_space_size,
                                         index_t is_valid_element = 0,
                                         bool_constant<pre_nop>   = {})
@@ -2022,12 +2165,14 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
         make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
 
     amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
         dst,
         src_wave_buffer_resource,
         src_thread_addr_offset,
         0,
+        src_linear_addr_offset,
         is_valid_element,
         bool_constant<pre_nop>{});
 }
@@ -2041,16 +2186,19 @@ template <typename T,
 CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
                                         const int32x4_t src_wave_buffer_resource,
                                         index_t src_thread_element_offset,
+                                        index_t src_linear_element_offset,
                                         index_t is_valid_element = 0,
                                         bool_constant<pre_nop>   = {})
 {
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
 
     amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
         dst,
         src_wave_buffer_resource,
         src_thread_addr_offset,
         0,
+        src_linear_addr_offset,
         is_valid_element,
         bool_constant<pre_nop>{});
 }
@@ -2066,6 +2214,7 @@ template <typename T,
 CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
                                                        const T* p_src_wave,
                                                        index_t src_thread_element_offset,
+                                                       index_t src_linear_element_offset,
                                                        index_t src_element_space_size,
                                                        bool_constant<pre_nop> = {})
 {
@@ -2073,9 +2222,14 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
         make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
 
-    amd_async_buffer_load_impl<T, N, coherence>(
-        smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant<pre_nop>{});
+    amd_async_buffer_load_impl<T, N, coherence>(smem,
+                                                src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                0,
+                                                src_linear_addr_offset,
+                                                bool_constant<pre_nop>{});
 }
 
 // This version support buffer resource as input arg
@@ -2086,12 +2240,42 @@ template <typename T,
 CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
                                                        const int32x4_t src_wave_buffer_resource,
                                                        index_t src_thread_element_offset,
+                                                       index_t src_linear_element_offset,
                                                        bool_constant<pre_nop> = {})
 {
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
 
-    amd_async_buffer_load_impl<T, N, coherence>(
-        smem, src_wave_buffer_resource, src_thread_addr_offset, 0, 0, bool_constant<pre_nop>{});
+    amd_async_buffer_load_impl<T, N, coherence>(smem,
+                                                src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                0,
+                                                src_linear_addr_offset,
+                                                bool_constant<pre_nop>{});
+}
+
+// This version support buffer resource as input arg
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = false>
+CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
+                                                   const int32x4_t src_wave_buffer_resource,
+                                                   index_t src_thread_element_offset,
+                                                   index_t src_linear_element_offset,
+                                                   bool is_valid_element,
+                                                   bool_constant<oob_conditional_check> = {})
+{
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_async_buffer_load<T, N, coherence>(smem,
+                                           src_wave_buffer_resource,
+                                           src_thread_addr_offset,
+                                           0,
+                                           src_linear_addr_offset,
+                                           is_valid_element,
+                                           bool_constant<oob_conditional_check>{});
 }
 
 // buffer_store requires:
@@ -2146,6 +2330,7 @@ template <typename T,
 CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_data,
                                          T* p_dst_wave,
                                          const index_t dst_thread_element_offset,
+                                         const index_t dst_linear_element_offset,
                                          const bool dst_thread_element_valid,
                                          const index_t dst_element_space_size)
 {
@@ -2153,11 +2338,13 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_d
         make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
 
     index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+    index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
 
     amd_buffer_store_raw_impl<T, N, coherence, oob_conditional_check>(src_thread_data,
                                                                       dst_wave_buffer_resource,
                                                                       dst_thread_addr_offset,
                                                                       0,
+                                                                      dst_linear_addr_offset,
                                                                       dst_thread_element_valid);
 }
 
@@ -2221,16 +2408,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
 #endif
 }
 
-// Direct loads from global to LDS.
-CK_TILE_DEVICE_EXTERN void
-llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
-                                __attribute__((address_space(3))) uint32_t* lds_ptr,
-                                index_t size,
-                                index_t voffset,
-                                index_t soffset,
-                                index_t offset,
-                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
-
 template <typename T, index_t NumElemsPerThread>
 CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                                   const index_t global_offset,
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index 580faae92..4be50b865 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -41,6 +41,19 @@
 #define CK_TILE_HOST_DEVICE_EXTERN
 #endif
 
+// implementing the "memory address space" attribute
+// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
+#ifdef __HIPCC_
+#define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
+#define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
+#define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
+#define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
+#else
+#define CK_TILE_GENERIC_ADDR
+#define CK_TILE_GLOBAL_ADDR
+#define CK_TILE_LDS_ADDR
+#define CK_TILE_BUF_RES_ADDR
+#endif
 #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
 #define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
 #endif
@@ -205,3 +218,8 @@
 #ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
 #define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
 #endif
+
+// workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
+#ifndef CK_TILE_WORKAROUND_SWDEV_383542
+#define CK_TILE_WORKAROUND_SWDEV_383542 1
+#endif
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index 598dfeea3..19d853ad5 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -623,7 +623,7 @@ template <typename... Ys,
               false>
 CK_TILE_HOST_DEVICE constexpr auto operator+=(tuple<Ys...>& y, const X& x)
 {
-    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    static_assert(X::size() == sizeof...(Ys), "wrong! size not the same");
     constexpr index_t NSize = sizeof...(Ys);
     static_for<0, NSize, 1>{}([&](auto i) { y[i] += x[i]; });
     return y;
@@ -635,7 +635,7 @@ template <typename... Ys,
               false>
 CK_TILE_HOST_DEVICE constexpr auto operator-=(tuple<Ys...>& y, const X& x)
 {
-    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    static_assert(X::size() == sizeof...(Ys), "wrong! size not the same");
     constexpr index_t NSize = sizeof...(Ys);
     static_for<0, NSize, 1>{}([&](auto i) { y[i] -= x[i]; });
     return y;
@@ -647,7 +647,7 @@ template <typename... Xs,
               false>
 CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const Y& y)
 {
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
     constexpr index_t NSize = sizeof...(Xs);
 
     tuple<Xs...> r;
@@ -655,13 +655,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const Y& y)
     return r;
 }
 
+template <typename... Xs, typename... Ys>
+CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const tuple<Ys...>& y)
+{
+    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
+    constexpr index_t NSize = sizeof...(Xs);
+    return generate_tuple([&](auto i) { return x[i] + y[i]; }, number<NSize>{});
+}
+
 template <typename... Xs,
           typename Y,
           std::enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> =
               false>
 CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const Y& y)
 {
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
     constexpr index_t NSize = sizeof...(Xs);
 
     tuple<Xs...> r;
@@ -669,13 +677,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const Y& y)
     return r;
 }
 
+template <typename... Xs, typename... Ys>
+CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const tuple<Ys...>& y)
+{
+    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
+    constexpr index_t NSize = sizeof...(Xs);
+    return generate_tuple([&](auto i) { return x[i] - y[i]; }, number<NSize>{});
+}
+
 template <typename... Xs,
           typename Y,
           std::enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> =
               false>
 CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, const Y& y)
 {
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
     constexpr index_t NSize = sizeof...(Xs);
 
     tuple<Xs...> r;
@@ -706,6 +722,14 @@ CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, Y a)
     return a * x;
 }
 
+template <typename... Xs, typename... Ys>
+CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, const tuple<Ys...>& y)
+{
+    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
+    constexpr index_t NSize = sizeof...(Xs);
+    return generate_tuple([&](auto i) { return x[i] * y[i]; }, number<NSize>{});
+}
+
 template <typename... Xs, typename... Ys>
 CK_TILE_HOST_DEVICE constexpr auto operator/(const tuple<Xs...>& x, const tuple<Ys...>& y)
 {
diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp
index f512e50e0..785691b66 100644
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
@@ -487,55 +487,12 @@ struct log2e<float>
 template <typename T = double>
 constexpr T log2e_v = log2e<T>::value;
 
-// math
-CK_TILE_HOST_DEVICE
-float abs(const float& x)
-{
-    union
-    {
-        float f32;
-        uint32_t u32;
-    } y;
-    y.f32 = x;
-    y.u32 = y.u32 & 0x7fffffff;
-    return y.f32;
-}
-
-CK_TILE_HOST_DEVICE
-bool isnan(const float& x)
-{
-    uint32_t xx = bit_cast<uint32_t>(x);
-    return (xx & 0x7fffffff) > 0x7F800000;
-}
-
-CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); };
-
-CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); };
-
-CK_TILE_DEVICE
-float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); };
-
-CK_TILE_DEVICE
-double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
-
-CK_TILE_DEVICE
-float exp(float x) { return __ocml_exp_f32(x); };
-
-CK_TILE_HOST
-float exp(float x) { return std::expf(x); }
-
 CK_TILE_DEVICE
 float exp2(float x) { return exp2f(x); };
 
 CK_TILE_HOST
 float exp2(float x) { return std::exp2f(x); };
 
-CK_TILE_DEVICE
-float log(float x) { return __logf(x); };
-
-CK_TILE_HOST
-float log(float x) { return std::logf(x); };
-
 CK_TILE_DEVICE uint16_t sad_u16(uint16_t x, uint16_t y, uint16_t acc)
 {
     return __builtin_amdgcn_sad_u16(x, y, acc);
@@ -554,4 +511,933 @@ CK_TILE_HOST uint32_t sad_u32(uint32_t x, uint32_t y, uint32_t acc)
     return (x > y ? (x - y) : (y - x)) + acc;
 }
 
+///////////////////////////////////////////////////////////////
+
+} // namespace ck_tile
+// blow function need data type pre-defined
+#include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/float8.hpp"
+#include "ck_tile/core/numeric/type_convert.hpp"
+#ifndef __HIP_DEVICE_COMPILE__
+#include <cmath>
+#endif
+
+namespace ck_tile {
+#if CK_TILE_WORKAROUND_SWDEV_383542
+extern "C" CK_TILE_DEVICE float __ocml_native_recip_f32(float);
+#endif
+
+// math functions for the host,  some are implemented by calling C++ std functions
+
+CK_TILE_HOST float abs(float x) { return std::abs(x); };
+
+CK_TILE_HOST double abs(double x) { return std::abs(x); };
+
+CK_TILE_HOST int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+CK_TILE_HOST int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+CK_TILE_HOST fp16_t abs(fp16_t x)
+{
+    uint16_t xx = bit_cast<uint16_t>(x);
+
+    uint16_t abs_xx = xx & 0x7fff;
+
+    fp16_t abs_x = bit_cast<fp16_t>(abs_xx);
+
+    return abs_x;
+};
+
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+CK_TILE_HOST int4_t abs(int4_t x)
+{
+    int4_t sgn = x >> (4 - 1);
+    return (x ^ sgn) - sgn;
+}
+#endif
+
+CK_TILE_HOST bool isnan(float x) { return std::isnan(x); };
+
+CK_TILE_HOST bool isnan(double x) { return std::isnan(x); };
+
+CK_TILE_HOST bool isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+CK_TILE_HOST bool isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+CK_TILE_HOST bool isnan(fp16_t x)
+{
+    uint16_t xx = bit_cast<uint16_t>(x);
+
+    return (xx & 0x7FFF) > 0x7C00;
+};
+
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+CK_TILE_HOST bool isnan(int4_t x)
+{
+    (void)x;
+    return false;
+};
+#endif
+
+CK_TILE_HOST fp16_t sqrt(fp16_t x)
+{
+    return static_cast<fp16_t>(std::sqrt(static_cast<float>(x)));
+};
+
+CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); };
+
+CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); };
+
+template <typename T>
+CK_TILE_HOST T tanh(T x)
+{
+    return type_convert<T>(std::tanhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float tanh<float>(float x)
+{
+    return std::tanhf(x);
+};
+
+template <>
+CK_TILE_HOST double tanh<double>(double x)
+{
+    return std::tanh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T acos(T x)
+{
+    return type_convert<T>(std::acosf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float acos<float>(float x)
+{
+    return std::acosf(x);
+};
+
+template <>
+CK_TILE_HOST double acos<double>(double x)
+{
+    return std::acos(x);
+};
+
+template <typename T>
+CK_TILE_HOST T neg(T x)
+{
+    return type_convert<T>(-(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float neg<float>(float x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_HOST double neg<double>(double x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_HOST int32_t neg<int32_t>(int32_t x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_HOST int8_t neg<int8_t>(int8_t x)
+{
+    return -x;
+};
+
+template <typename T>
+CK_TILE_HOST T atan(T x)
+{
+    return type_convert<T>(std::atanf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float atan<float>(float x)
+{
+    return std::atanf(x);
+};
+
+template <>
+CK_TILE_HOST double atan<double>(double x)
+{
+    return std::atan(x);
+};
+
+template <typename T>
+CK_TILE_HOST T sin(T x)
+{
+    return type_convert<T>(std::sinf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float sin<float>(float x)
+{
+    return std::sinf(x);
+};
+
+template <>
+CK_TILE_HOST double sin<double>(double x)
+{
+    return std::sin(x);
+};
+
+template <typename T>
+CK_TILE_HOST T asin(T x)
+{
+    return type_convert<T>(std::asinf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float asin<float>(float x)
+{
+    return std::asinf(x);
+};
+
+template <>
+CK_TILE_HOST double asin<double>(double x)
+{
+    return std::asin(x);
+};
+
+template <typename T>
+CK_TILE_HOST T asinh(T x)
+{
+    return type_convert<T>(std::asinhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float asinh<float>(float x)
+{
+    return std::asinhf(x);
+};
+
+template <>
+CK_TILE_HOST double asinh<double>(double x)
+{
+    return std::asinh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T cos(T x)
+{
+    return type_convert<T>(std::cosf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float cos<float>(float x)
+{
+    return std::cosf(x);
+};
+
+template <>
+CK_TILE_HOST double cos<double>(double x)
+{
+    return std::cos(x);
+};
+
+template <typename T>
+CK_TILE_HOST T acosh(T x)
+{
+    return type_convert<T>(std::acoshf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float acosh<float>(float x)
+{
+    return std::acoshf(x);
+};
+
+template <>
+CK_TILE_HOST double acosh<double>(double x)
+{
+    return std::acosh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T tan(T x)
+{
+    return type_convert<T>(std::tanf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float tan<float>(float x)
+{
+    return std::tanf(x);
+};
+
+template <>
+CK_TILE_HOST double tan<double>(double x)
+{
+    return std::tan(x);
+};
+
+template <typename T>
+CK_TILE_HOST T atanh(T x)
+{
+    return type_convert<T>(std::atanhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float atanh<float>(float x)
+{
+    return std::atanhf(x);
+};
+
+template <>
+CK_TILE_HOST double atanh<double>(double x)
+{
+    return std::atanh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T sinh(T x)
+{
+    return type_convert<T>(std::sinhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float sinh<float>(float x)
+{
+    return std::sinhf(x);
+};
+
+template <>
+CK_TILE_HOST double sinh<double>(double x)
+{
+    return std::sinh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T ceil(T x)
+{
+    return type_convert<T>(std::ceilf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float ceil<float>(float x)
+{
+    return std::ceilf(x);
+};
+
+template <>
+CK_TILE_HOST double ceil<double>(double x)
+{
+    return std::ceil(x);
+};
+
+template <typename T>
+CK_TILE_HOST T cosh(T x)
+{
+    return type_convert<T>(std::coshf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float cosh<float>(float x)
+{
+    return std::coshf(x);
+};
+
+template <>
+CK_TILE_HOST double cosh<double>(double x)
+{
+    return std::cosh(x);
+};
+
+template <typename T>
+CK_TILE_HOST T floor(T x)
+{
+    return type_convert<T>(std::floorf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_HOST float floor<float>(float x)
+{
+    return std::floorf(x);
+};
+
+template <>
+CK_TILE_HOST double floor<double>(double x)
+{
+    return std::floor(x);
+};
+
+template <typename T>
+CK_TILE_HOST T rcp(T x)
+{
+    return type_convert<T>(1.f / type_convert<float>(x));
+};
+
+template <typename T>
+CK_TILE_HOST T exp(T x)
+{
+    return type_convert<T>(std::expf(type_convert<float>(x)));
+}
+
+template <>
+CK_TILE_HOST float exp<float>(float x)
+{
+    return std::expf(x);
+}
+
+template <>
+CK_TILE_HOST double exp<double>(double x)
+{
+    return std::exp(x);
+}
+
+template <typename T>
+CK_TILE_HOST T log(T x)
+{
+    return type_convert<T>(std::logf(type_convert<float>(x)));
+}
+
+template <>
+CK_TILE_HOST float log<float>(float x)
+{
+    return std::logf(x);
+}
+
+template <>
+CK_TILE_HOST double log<double>(double x)
+{
+    return std::log(x);
+}
+
+template <typename T>
+CK_TILE_HOST T pow(T x, T gamma)
+{
+    return type_convert<T>(std::powf(type_convert<float>(x), type_convert<float>(gamma)));
+}
+
+template <>
+CK_TILE_HOST float pow<float>(float x, float gamma)
+{
+    return std::powf(x, gamma);
+}
+
+template <>
+CK_TILE_HOST double pow<double>(double x, double gamma)
+{
+    return std::pow(x, gamma);
+}
+
+template <typename T>
+CK_TILE_HOST T expm1(T x)
+{
+    return type_convert<T>(std::expm1f(type_convert<float>(x)));
+}
+
+template <>
+CK_TILE_HOST float expm1<float>(float x)
+{
+    return std::expm1f(x);
+}
+
+template <>
+CK_TILE_HOST double expm1<double>(double x)
+{
+    return std::expm1(x);
+}
+
+// math functions for the HIP kernel,  some are implemented by calling hip builtin functions
+
+CK_TILE_DEVICE float abs(float x)
+{
+    union
+    {
+        float f32;
+        uint32_t u32;
+    } y;
+    y.f32 = x;
+    y.u32 = y.u32 & 0x7fffffff;
+    return y.f32;
+};
+
+CK_TILE_DEVICE double abs(double x) { return ::abs(x); };
+
+CK_TILE_DEVICE int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+CK_TILE_DEVICE int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+CK_TILE_DEVICE int4_t abs(int4_t x)
+{
+    int4_t sgn = x >> (4 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+#endif
+
+CK_TILE_DEVICE fp16_t abs(fp16_t x)
+{
+    uint16_t xx = bit_cast<uint16_t>(x);
+
+    uint16_t abs_xx = xx & 0x7fff;
+
+    fp16_t abs_x = bit_cast<fp16_t>(abs_xx);
+
+    return abs_x;
+};
+
+CK_TILE_DEVICE bool isnan(float x) { return ::isnan(x); };
+
+CK_TILE_DEVICE bool isnan(double x) { return ::isnan(x); };
+
+CK_TILE_DEVICE bool isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+CK_TILE_DEVICE bool isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+CK_TILE_DEVICE bool isnan(int4_t x)
+{
+    (void)x;
+    return false;
+};
+#endif
+
+CK_TILE_DEVICE bool isnan(fp16_t x)
+{
+    uint16_t xx = bit_cast<uint16_t>(x);
+
+    return (xx & 0x7FFF) > 0x7C00;
+};
+
+CK_TILE_DEVICE fp16_t sqrt(fp16_t x)
+{
+    return static_cast<fp16_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
+};
+
+CK_TILE_DEVICE float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); };
+
+CK_TILE_DEVICE double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
+
+template <typename T>
+CK_TILE_DEVICE T tanh(T x)
+{
+    return type_convert<T>(::tanhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float tanh<float>(float x)
+{
+    return ::tanhf(x);
+};
+
+template <>
+CK_TILE_DEVICE double tanh<double>(double x)
+{
+    return ::tanh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T acos(T x)
+{
+    return type_convert<T>(::acosf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float acos<float>(float x)
+{
+    return ::acosf(x);
+};
+
+template <>
+CK_TILE_DEVICE double acos<double>(double x)
+{
+    return ::acos(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T neg(T x)
+{
+    return type_convert<T>(-(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float neg<float>(float x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_DEVICE double neg<double>(double x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_DEVICE int32_t neg<int32_t>(int32_t x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_DEVICE int8_t neg<int8_t>(int8_t x)
+{
+    return -x;
+};
+
+template <>
+CK_TILE_DEVICE fp16_t neg<fp16_t>(fp16_t x)
+{
+    return __hneg(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T atan(T x)
+{
+    return type_convert<T>(::atanf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float atan<float>(float x)
+{
+    return ::atanf(x);
+};
+
+template <>
+CK_TILE_DEVICE double atan<double>(double x)
+{
+    return ::atan(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T sin(T x)
+{
+    return type_convert<T>(::sinf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float sin<float>(float x)
+{
+    return ::sinf(x);
+};
+
+template <>
+CK_TILE_DEVICE double sin<double>(double x)
+{
+    return ::sin(x);
+};
+
+template <>
+CK_TILE_DEVICE fp16_t sin<fp16_t>(fp16_t x)
+{
+    return ::hsin(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T asin(T x)
+{
+    return type_convert<T>(::asinf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float asin<float>(float x)
+{
+    return ::asinf(x);
+};
+
+template <>
+CK_TILE_DEVICE double asin<double>(double x)
+{
+    return ::asin(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T asinh(T x)
+{
+    return type_convert<T>(::asinhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float asinh<float>(float x)
+{
+    return ::asinhf(x);
+};
+
+template <>
+CK_TILE_DEVICE double asinh<double>(double x)
+{
+    return ::asinh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T acosh(T x)
+{
+    return type_convert<T>(::acoshf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float acosh<float>(float x)
+{
+    return ::acoshf(x);
+};
+
+template <>
+CK_TILE_DEVICE double acosh<double>(double x)
+{
+    return ::acosh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T tan(T x)
+{
+    return type_convert<T>(::tanf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float tan<float>(float x)
+{
+    return ::tanf(x);
+};
+
+template <>
+CK_TILE_DEVICE double tan<double>(double x)
+{
+    return ::tan(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T atanh(T x)
+{
+    return type_convert<T>(::atanhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float atanh<float>(float x)
+{
+    return ::atanhf(x);
+};
+
+template <>
+CK_TILE_DEVICE double atanh<double>(double x)
+{
+    return ::atanh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T sinh(T x)
+{
+    return type_convert<T>(::sinhf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float sinh<float>(float x)
+{
+    return ::sinhf(x);
+};
+
+template <>
+CK_TILE_DEVICE double sinh<double>(double x)
+{
+    return ::sinh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T ceil(T x)
+{
+    return type_convert<T>(::ceilf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float ceil<float>(float x)
+{
+    return ::ceilf(x);
+};
+
+template <>
+CK_TILE_DEVICE double ceil<double>(double x)
+{
+    return ::ceil(x);
+};
+
+template <>
+CK_TILE_DEVICE fp16_t ceil<fp16_t>(fp16_t x)
+{
+    return ::hceil(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T cosh(T x)
+{
+    return type_convert<T>(::coshf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float cosh<float>(float x)
+{
+    return ::coshf(x);
+};
+
+template <>
+CK_TILE_DEVICE double cosh<double>(double x)
+{
+    return ::cosh(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T floor(T x)
+{
+    return type_convert<T>(::floorf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float floor<float>(float x)
+{
+    return ::floorf(x);
+};
+
+template <>
+CK_TILE_DEVICE double floor<double>(double x)
+{
+    return ::floor(x);
+};
+
+template <>
+CK_TILE_DEVICE fp16_t floor<fp16_t>(fp16_t x)
+{
+    return ::hfloor(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T rcp(T x)
+{
+#if !CK_TILE_WORKAROUND_SWDEV_383542
+    return __frcp_rn(x);
+#else
+    // return __ocml_native_recip_f32(x);
+    return __builtin_amdgcn_rcpf(x);
+#endif
+};
+
+template <typename T>
+CK_TILE_DEVICE T exp(T x)
+{
+    return type_convert<T>(__ocml_exp_f32(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE fp16_t exp<fp16_t>(fp16_t x)
+{
+    return hexp(x);
+};
+
+template <>
+CK_TILE_DEVICE float exp<float>(float x)
+{
+    return __ocml_exp_f32(x);
+};
+
+template <>
+CK_TILE_DEVICE double exp<double>(double x)
+{
+    return exp(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T log(T x)
+{
+    return type_convert<T>(__logf(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE fp16_t log<fp16_t>(fp16_t x)
+{
+    return hlog(x);
+};
+
+template <>
+CK_TILE_DEVICE float log<float>(float x)
+{
+    return __logf(x);
+};
+
+template <>
+CK_TILE_DEVICE double log<double>(double x)
+{
+    return log(x);
+};
+
+template <typename T>
+CK_TILE_DEVICE T pow(T x, T gamma)
+{
+    return type_convert<T>(powf(type_convert<float>(x), type_convert<float>(gamma)));
+};
+
+template <>
+CK_TILE_DEVICE float pow<float>(float x, float gamma)
+{
+    return powf(x, gamma);
+};
+
+template <>
+CK_TILE_DEVICE double pow<double>(double x, double gamma)
+{
+    return pow(x, gamma);
+};
+
+template <typename T>
+CK_TILE_DEVICE T expm1(T x)
+{
+    return type_convert<T>(expm1f(type_convert<float>(x)));
+};
+
+template <>
+CK_TILE_DEVICE float expm1<float>(float x)
+{
+    return expm1f(x);
+};
+
+template <>
+CK_TILE_DEVICE double expm1<double>(double x)
+{
+    return expm1(x);
+};
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index ed705c91e..2cc788d42 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -91,8 +91,10 @@ struct buffer_view<address_space_enum::generic,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE constexpr auto get(index_t i,
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -107,11 +109,11 @@ struct buffer_view<address_space_enum::generic,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
             X tmp;
 
-            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
 
             return tmp;
 #else
-            return *c_style_pointer_cast<const X*>(&p_data_[i]);
+            return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
 #endif
         }
         else
@@ -134,17 +136,17 @@ struct buffer_view<address_space_enum::generic,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         if constexpr(Op == memory_operation_enum::set)
         {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
         }
         // FIXME: remove memory_operation_enum::add
         else if constexpr(Op == memory_operation_enum::add)
         {
-            auto tmp = this->template get<X>(i, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
         }
     }
 
@@ -154,7 +156,7 @@ struct buffer_view<address_space_enum::generic,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -169,9 +171,9 @@ struct buffer_view<address_space_enum::generic,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
             X tmp = x;
 
-            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+            __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-            *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+            *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
         }
     }
@@ -276,8 +278,10 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE constexpr auto get(index_t i,
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -303,7 +307,7 @@ struct buffer_view<address_space_enum::global,
                                                                    t_per_x,
                                                                    Coherence,
                                                                    oob_conditional_check>(
-                    p_data_, i, is_valid_element, buffer_size_);
+                    p_data_, i + linear_offset, is_valid_element, buffer_size_);
             }
             else
             {
@@ -311,8 +315,11 @@ struct buffer_view<address_space_enum::global,
                     remove_cvref_t<T>,
                     t_per_x,
                     Coherence,
-                    oob_conditional_check>(
-                    p_data_, i, is_valid_element, buffer_size_, invalid_element_value_);
+                    oob_conditional_check>(p_data_,
+                                           i + linear_offset,
+                                           is_valid_element,
+                                           buffer_size_,
+                                           invalid_element_value_);
             }
         }
         else
@@ -322,11 +329,11 @@ struct buffer_view<address_space_enum::global,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                 X tmp;
 
-                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+                __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
 
                 return tmp;
 #else
-                return *c_style_pointer_cast<const X*>(&p_data_[i]);
+                return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
 #endif
             }
             else
@@ -352,7 +359,8 @@ struct buffer_view<address_space_enum::global,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
     CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
-                                          index_t i,
+                                          index_t v_offset,
+                                          index_t i_offset,
                                           bool is_valid_element,
                                           bool_constant<pre_nop> = {}) const
     {
@@ -366,7 +374,38 @@ struct buffer_view<address_space_enum::global,
         constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
 
         amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
-            dst, cached_buf_res_, i, is_valid_element, bool_constant<pre_nop>{});
+            dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant<pre_nop>{});
+    }
+
+    // i is offset of T, not X. i should be aligned to X
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE constexpr auto async_get(CK_TILE_LDS_ADDR remove_cvref_t<T>* smem,
+                                            index_t i,
+                                            index_t linear_offset,
+                                            bool is_valid_element,
+                                            bool_constant<oob_conditional_check> = {}) const
+    {
+        // X is vector of T
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
+
+        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
+                      "wrong! X should contain multiple T");
+
+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+        amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
+            smem,
+            cached_buf_res_,
+            i,
+            linear_offset,
+            is_valid_element,
+            bool_constant<oob_conditional_check>{});
     }
 
     // i is offset of T, not X. i should be aligned to X
@@ -378,6 +417,7 @@ struct buffer_view<address_space_enum::global,
                   bool>::type = false>
     CK_TILE_DEVICE constexpr auto async_get_raw(remove_cvref_t<T>* smem,
                                                 index_t i,
+                                                index_t linear_offset,
                                                 bool /*is_valid_element*/,
                                                 bool_constant<pre_nop> = {}) const
     {
@@ -391,7 +431,7 @@ struct buffer_view<address_space_enum::global,
         constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
 
         amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
-            smem, cached_buf_res_, i, bool_constant<pre_nop>{});
+            smem, cached_buf_res_, i, linear_offset, bool_constant<pre_nop>{});
     }
 
     // i is offset of T, not X. i should be aligned to X
@@ -401,25 +441,25 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         if constexpr(Op == memory_operation_enum::set)
         {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
         }
         else if constexpr(Op == memory_operation_enum::atomic_add)
         {
-            this->template atomic_add<X>(i, is_valid_element, x);
+            this->template atomic_add<X>(i, linear_offset, is_valid_element, x);
         }
         else if constexpr(Op == memory_operation_enum::atomic_max)
         {
-            this->template atomic_max<X>(i, is_valid_element, x);
+            this->template atomic_max<X>(i, linear_offset, is_valid_element, x);
         }
         // FIXME: remove memory_operation_enum::add
         else if constexpr(Op == memory_operation_enum::add)
         {
-            auto tmp = this->template get<X>(i, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
             // tmp += x;
             // this->template set<X>(i, is_valid_element, tmp);
         }
@@ -432,7 +472,7 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -453,7 +493,7 @@ struct buffer_view<address_space_enum::global,
             constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
 
             amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
         }
         else
         {
@@ -462,9 +502,9 @@ struct buffer_view<address_space_enum::global,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                 X tmp = x;
 
-                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+                __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+                *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
             }
         }
@@ -477,7 +517,7 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void set_raw(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -489,7 +529,7 @@ struct buffer_view<address_space_enum::global,
 
         constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
         amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
-            x, p_data_, i, is_valid_element, buffer_size_);
+            x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
     }
 
     template <typename X,
@@ -497,7 +537,8 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void atomic_add(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void
+    atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
 
@@ -532,13 +573,13 @@ struct buffer_view<address_space_enum::global,
         if constexpr(use_amd_buffer_addressing)
         {
             amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
         }
         else
         {
             if(is_valid_element)
             {
-                atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
+                atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
             }
         }
     }
@@ -548,7 +589,8 @@ struct buffer_view<address_space_enum::global,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void atomic_max(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void
+    atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -572,11 +614,11 @@ struct buffer_view<address_space_enum::global,
         if constexpr(use_amd_buffer_addressing)
         {
             amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
         }
         else if(is_valid_element)
         {
-            atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
+            atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
         }
     }
 
@@ -668,8 +710,10 @@ struct buffer_view<address_space_enum::lds,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE constexpr auto get(index_t i,
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -684,14 +728,14 @@ struct buffer_view<address_space_enum::lds,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
             X tmp;
 
-            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
 
             return tmp;
 #else
             using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
                                        scalar_per_t_vector * scalar_per_x_vector>;
             // using buf_t = ushort __attribute__((ext_vector_type(8)));
-            auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i]);
+            auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
             return bit_cast<X>(rtn);
 #endif
         }
@@ -708,6 +752,23 @@ struct buffer_view<address_space_enum::lds,
         }
     }
 
+    // i is offset of T, not X. i should be aligned to X
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
+                                          index_t v_offset,
+                                          index_t i_offset,
+                                          bool /*is_valid_element*/,
+                                          bool_constant<pre_nop> = {}) const
+    {
+        smem_load<sizeof(X)>{}(dst, v_offset * sizeof(T), i_offset * sizeof(T));
+    }
+
     // i is offset of T, not X. i should be aligned to X
     template <memory_operation_enum Op,
               typename X,
@@ -715,17 +776,17 @@ struct buffer_view<address_space_enum::lds,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         if constexpr(Op == memory_operation_enum::set)
         {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
         }
         // FIXME: remove memory_operation_enum::add
         else if constexpr(Op == memory_operation_enum::add)
         {
-            auto tmp = this->template get<X>(i, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
         }
     }
 
@@ -735,7 +796,7 @@ struct buffer_view<address_space_enum::lds,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -751,6 +812,7 @@ struct buffer_view<address_space_enum::lds,
         bool constexpr workaround_int8_ds_write_issue = false;
 #endif
 
+        i += linear_offset; // simplicity
         if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
                                   int8_t>::value &&
                      workaround_int8_ds_write_issue)
@@ -952,8 +1014,10 @@ struct buffer_view<address_space_enum::vgpr,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE constexpr auto get(index_t i,
+                                      index_t /*linear_offset*/,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -995,17 +1059,17 @@ struct buffer_view<address_space_enum::vgpr,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         if constexpr(Op == memory_operation_enum::set)
         {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
         }
         // FIXME: remove memory_operation_enum::add
         else if constexpr(Op == memory_operation_enum::add)
         {
-            auto tmp = this->template get<X>(i, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
         }
     }
 
@@ -1015,7 +1079,7 @@ struct buffer_view<address_space_enum::vgpr,
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -1030,9 +1094,9 @@ struct buffer_view<address_space_enum::vgpr,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
             X tmp = x;
 
-            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+            __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-            *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+            *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
         }
     }
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index aeda5e9c0..06b5a8da0 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -12,6 +12,7 @@
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/tensor/null_tile_window.hpp"
 #include "ck_tile/core/tensor/null_tensor.hpp"
 
@@ -28,7 +29,21 @@ CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution<BottomT
                                                                          NumCoord>& tile_window,
                               bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.load(bool_constant<oob_conditional_check>{});
+    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
+}
+
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile(const tile_window_linear<BottomTensorView_,
+                                                       WindowLengths_,
+                                                       TileDistribution_,
+                                                       LinearBottomDims_>& tile_window,
+                              bool_constant<oob_conditional_check> = {})
+{
+    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
 }
 
 template <typename T,
@@ -46,7 +61,27 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
                                   bool_constant<oob_conditional_check> = {},
                                   bool_constant<pre_nop>               = {})
 {
-    tile_window.load_raw(tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+    tile_window.load_raw(
+        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+}
+
+template <typename T,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE auto load_tile_raw(T& tile,
+                                  const tile_window_linear<BottomTensorView_,
+                                                           WindowLengths_,
+                                                           TileDistribution_,
+                                                           LinearBottomDims_>& tile_window,
+                                  bool_constant<oob_conditional_check> = {},
+                                  bool_constant<pre_nop>               = {})
+{
+    tile_window.load_raw(
+        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 
 template <typename LdsTileWindow_,
@@ -66,7 +101,26 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile,
                     bool_constant<pre_nop>               = {})
 {
     return tile_window.async_load_raw(
-        lds_tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+}
+
+template <typename LdsTileWindow_,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile,
+                                        const tile_window_linear<BottomTensorView_,
+                                                                 WindowLengths_,
+                                                                 TileDistribution_,
+                                                                 LinearBottomDims_>& tile_window,
+                                        bool_constant<oob_conditional_check> = {},
+                                        bool_constant<pre_nop>               = {})
+{
+    return tile_window.async_load_raw(
+        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 
 CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0)
diff --git a/include/ck_tile/core/tensor/shuffle_tile.hpp b/include/ck_tile/core/tensor/shuffle_tile.hpp
index baf009add..da3c7117e 100644
--- a/include/ck_tile/core/tensor/shuffle_tile.hpp
+++ b/include/ck_tile/core/tensor/shuffle_tile.hpp
@@ -109,7 +109,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
 
         // get input vectors
         static_for<0, num_vec_in, 1>{}([&](auto i) {
-            constexpr auto idx_y_in = generate_array(
+            constexpr auto idx_y_in = generate_tuple(
                 [&](auto ii) {
                     return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
                 },
diff --git a/include/ck_tile/core/tensor/store_tile.hpp b/include/ck_tile/core/tensor/store_tile.hpp
index 2efc65701..d5a716664 100644
--- a/include/ck_tile/core/tensor/store_tile.hpp
+++ b/include/ck_tile/core/tensor/store_tile.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
@@ -72,7 +73,7 @@ store_tile(tile_window_with_static_distribution<BottomTensorView_,
                                                 NumCoord>& tile_window,
            const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
 {
-    tile_window.store(dstr_tensor);
+    tile_window.store(dstr_tensor, number<-1>{});
 }
 
 template <typename BottomTensorView_,
@@ -87,7 +88,33 @@ store_tile_raw(tile_window_with_static_distribution<BottomTensorView_,
                                                     NumCoord>& tile_window,
                const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
 {
-    tile_window.store_raw(dstr_tensor);
+    tile_window.store_raw(dstr_tensor, number<-1>{});
+}
+
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          typename DataType_>
+CK_TILE_DEVICE void store_tile(
+    tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
+        tile_window,
+    const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    tile_window.store(dstr_tensor, number<-1>{});
+}
+
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          typename DataType_>
+CK_TILE_DEVICE void store_tile_raw(
+    tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
+        tile_window,
+    const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    tile_window.store_raw(dstr_tensor, number<-1>{});
 }
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 4655eec24..698ce5378 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -16,6 +16,24 @@
 
 namespace ck_tile {
 
+/*
+ * tensor_view
+ * abstract the underneath memory buffer(global, LDS, etc...)
+ * and provide a unified get/set function for access
+ *
+ * For addressing into the buffer we use 2 variable to control:
+ * coord : ND tensor coordinate, will calculate the actual offset inside
+ * linear_offset : 1D offset, will be used in the immediate field of
+ *   the buffer instruction to help reduce register usage
+ *
+ * User can use either of the field, or both to indexing into the tensor
+ *
+ * We usually provide 2 set of API for buffer get/set, e.g.
+ * get_vectorized_elements()/get_vectorized_elements_raw()
+ * the former usually will call intrinsic or normal C function, the later
+ * usually will call inline-asm function
+ *
+ */
 template <typename BufferView_,
           typename TensorDesc_,
           memory_operation_enum DstInMemOp_ = memory_operation_enum::set>
@@ -49,22 +67,6 @@ struct tensor_view
 
     CK_TILE_HOST_DEVICE constexpr auto& get_buffer_view() { return buf_; }
 
-#if 0
-    CK_TILE_HOST_DEVICE constexpr DataType get_element(const TensorCoord& coord) const
-    {
-        return buf_.template get<DataType>(
-            coord.get_offset(),
-            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
-    }
-
-    CK_TILE_HOST_DEVICE constexpr void set_element(const TensorCoord& coord, const DataType& x)
-    {
-        buf_.template set<DataType>(
-            coord.get_offset(),
-            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
-            x);
-    }
-#endif
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
     template <typename X,
@@ -75,14 +77,34 @@ struct tensor_view
                   bool>::type = false>
     CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
     get_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
                             bool_constant<oob_conditional_check> = {}) const
     {
         return buf_.template get<X>(
             coord.get_offset(),
+            linear_offset,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<oob_conditional_check>{});
     }
 
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
+    get_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
+                            bool is_valid_element, // flag
+                            bool_constant<oob_conditional_check> = {}) const
+    {
+        return buf_.template get<X>(coord.get_offset(),
+                                    linear_offset,
+                                    is_valid_element,
+                                    bool_constant<oob_conditional_check>{});
+    }
+
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
     template <typename X,
@@ -94,12 +116,90 @@ struct tensor_view
                   bool>::type = false>
     CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst,
                                                          const TensorCoord& coord,
+                                                         index_t linear_offset,
                                                          bool_constant<oob_conditional_check> = {},
                                                          bool_constant<pre_nop> = {}) const
     {
         return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
             dst,
             coord.get_offset(),
+            linear_offset,
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
+            bool_constant<pre_nop>{});
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst,
+                                                         const TensorCoord& coord,
+                                                         index_t linear_offset,
+                                                         bool is_valid_element,
+                                                         bool_constant<oob_conditional_check> = {},
+                                                         bool_constant<pre_nop> = {}) const
+    {
+        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
+            dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem,
+                                  const TensorCoord& coord,
+                                  index_t linear_offset) const
+    {
+        return buf_.template async_get<X>(
+            smem,
+            coord.get_offset(),
+            linear_offset,
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
+            bool_constant<oob_conditional_check>{});
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem,
+                                  const TensorCoord& coord,
+                                  index_t linear_offset,
+                                  bool is_valid_element) const
+    {
+        return buf_.template async_get<X>(smem,
+                                          coord.get_offset(),
+                                          linear_offset,
+                                          is_valid_element,
+                                          bool_constant<oob_conditional_check>{});
+    }
+
+    template <typename X,
+              bool pre_nop = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem,
+                                      const TensorCoord& coord,
+                                      index_t linear_offset,
+                                      bool_constant<pre_nop> = {}) const
+    {
+        return buf_.template async_get_raw<X>(
+            smem,
+            coord.get_offset(),
+            linear_offset,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<pre_nop>{});
     }
@@ -110,11 +210,15 @@ struct tensor_view
                   std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                  typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                   bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void async_get_vectorized_elements_raw(
-        remove_cvref_t<DataType>* smem, const TensorCoord& coord, bool_constant<pre_nop> = {}) const
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem,
+                                      const TensorCoord& coord,
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<pre_nop> = {}) const
     {
         return buf_.template async_get_raw<X>(
-            smem, coord.get_offset(), true /*not used*/, bool_constant<pre_nop>{});
+            smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
     }
 
     // X is vector of DataType.
@@ -125,11 +229,15 @@ struct tensor_view
                   std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                  typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                   bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements(
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
+                            const X& x,
+                            bool_constant<oob_conditional_check> = {})
     {
         buf_.template set<X, oob_conditional_check>(
             coord.get_offset(),
+            linear_offset,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
@@ -140,15 +248,53 @@ struct tensor_view
                   std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                  typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                   bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements_raw(
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
+                            bool is_valid_element,
+                            const X& x,
+                            bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template set<X, oob_conditional_check>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements_raw(const TensorCoord& coord,
+                                index_t linear_offset,
+                                const X& x,
+                                bool_constant<oob_conditional_check> = {})
     {
         buf_.template set_raw<X, oob_conditional_check>(
             coord.get_offset(),
+            linear_offset,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
 
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements_raw(const TensorCoord& coord,
+                                index_t linear_offset,
+                                bool is_valid_element,
+                                const X& x,
+                                bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template set_raw<X, oob_conditional_check>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
+
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
     template <typename X,
@@ -157,15 +303,36 @@ struct tensor_view
                   std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                  typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                   bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void update_vectorized_elements(
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    CK_TILE_HOST_DEVICE constexpr void
+    update_vectorized_elements(const TensorCoord& coord,
+                               index_t linear_offset,
+                               const X& x,
+                               bool_constant<oob_conditional_check> = {})
     {
         buf_.template update<DstInMemOp, X, oob_conditional_check>(
             coord.get_offset(),
+            linear_offset,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
 
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    update_vectorized_elements(const TensorCoord& coord,
+                               index_t linear_offset,
+                               bool is_valid_element,
+                               const X& x,
+                               bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template update<DstInMemOp, X, oob_conditional_check>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
+
     CK_TILE_HOST_DEVICE void print() const
     {
         printf("tensor_view{");
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 266d623c7..ca3507827 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -18,6 +18,8 @@
 
 namespace ck_tile {
 
+// Note: this tile window do not support single issue
+// you need to use tile_window_linear structure for this purpose
 template <typename BottomTensorView_,
           typename WindowLengths_,
           typename StaticTileDistribution_,
@@ -41,6 +43,7 @@ struct tile_window_with_static_distribution
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
+    static_assert(NumCoord == 1);
 
     // TODO: check WindowLengths and StaticTileDistribution are consistent
 
@@ -189,7 +192,8 @@ struct tile_window_with_static_distribution
             constexpr auto idx_diff_ys =
                 SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
 
-            constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+            constexpr auto idx_diff_ps_ys = container_concat(
+                generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
 
             move_window_adaptor_and_bottom_tensor_thread_coordinate(
                 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -222,10 +226,11 @@ struct tile_window_with_static_distribution
 
     // move thread's window adaptor coordinate and bottom tensor coordinate
     // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
+    template <typename ATopIndex>
     CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
         WindowAdaptorCoord& window_adaptor_thread_coord,
         BottomTensorCoord& bottom_tensor_thread_coord,
-        const AdaptorTopIndex& idx_diff_adaptor_top) const
+        const ATopIndex& idx_diff_adaptor_top) const
     {
         array<index_t, NDimBottomTensor> idx_diff_adaptor_bottom;
 
@@ -279,10 +284,11 @@ struct tile_window_with_static_distribution
                           get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims));
     }
 
-    CK_TILE_DEVICE constexpr auto get_num_access() const { return load_store_traits::NumAccess; }
+    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return load_store_traits::NumAccess; }
 
-    template <bool oob_conditional_check = true>
-    CK_TILE_DEVICE auto load(bool_constant<oob_conditional_check> = {}) const
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(number<i_access_unsupport_>          = {},
+                             bool_constant<oob_conditional_check> = {}) const
     {
         using Traits = load_store_traits;
 
@@ -308,11 +314,11 @@ struct tile_window_with_static_distribution
                 // read from bottom tensor
                 const vector_t vec_value =
                     get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
-                        bottom_tensor_thread_coord, bool_constant<oob_conditional_check>{});
+                        bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
 #if 1
                 // write into distributed tensor
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                             : idx_ys_start[jj];
@@ -338,8 +344,9 @@ struct tile_window_with_static_distribution
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -350,8 +357,12 @@ struct tile_window_with_static_distribution
         return dst_tensor;
     }
 
-    template <typename DstTile, bool oob_conditional_check = true, bool pre_nop = false>
+    template <typename DstTile,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true,
+              bool pre_nop                = false>
     CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
+                                 number<i_access_unsupport_>          = {},
                                  bool_constant<oob_conditional_check> = {},
                                  bool_constant<pre_nop>               = {}) const
     {
@@ -397,6 +408,7 @@ struct tile_window_with_static_distribution
                 get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
                     dst_vec_tbuf.template at<d / Traits::ScalarPerVector>(),
                     bottom_tensor_thread_coord,
+                    0 /**/,
                     bool_constant<oob_conditional_check>{},
                     pre_nop_);
 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
@@ -409,23 +421,24 @@ struct tile_window_with_static_distribution
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
                 }
             });
         });
-#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
-        asm volatile("; this inline asm is workaround to prevent compiler from using too much "
-                     "scratch memory" ::);
-#endif
     }
 
     // TODO: currently async load only implemented in inline asm
-    template <typename LdsTileWindow_, bool oob_conditional_check = true, bool pre_nop = false>
+    template <typename LdsTileWindow_,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true,
+              bool pre_nop                = false>
     CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
+                                       number<i_access_unsupport_>          = {},
                                        bool_constant<oob_conditional_check> = {},
                                        bool_constant<pre_nop>               = {}) const
     {
@@ -467,7 +480,7 @@ struct tile_window_with_static_distribution
 
         // loop over thread tensor space [y0, y1, ...]
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
-            // TODO: use structure binding (to be captured later) if compiled in C++20
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
             auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
             auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
 
@@ -482,15 +495,16 @@ struct tile_window_with_static_distribution
 
                 // read from bottom tensor
                 get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
-                    smem, bottom_tensor_thread_coord, pre_nop_);
+                    smem, bottom_tensor_thread_coord, 0, pre_nop_);
 
                 // move thread coordinate
                 if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -501,8 +515,81 @@ struct tile_window_with_static_distribution
         });
     }
 
-    template <bool oob_conditional_check = true>
+    template <typename LdsTileWindow_,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true>
+    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
+                                   number<i_access_unsupport_>          = {},
+                                   bool_constant<oob_conditional_check> = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+
+        // issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+
+        // TODO: LDS offset is not good for intrinsic based implementation(compiler can't figure out
+        // dependency) hence avoid use offset based solution. size_per_buf should be zero (how to
+        // check?)
+        constexpr index_t size_per_buf =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<0>{}, number<0>{}));
+
+        constexpr index_t size_per_wave =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<1>{}, number<0>{})) -
+            size_per_buf;
+
+        constexpr index_t size_per_issue =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<1>{}, number<0>{}, number<0>{})) -
+            size_per_buf;
+
+        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
+
+        using Traits = load_store_traits;
+
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
+
+        // TODO: we force CK_TILE_LDS_ADDR
+        CK_TILE_LDS_ADDR LdsDataType* smem =
+            lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value;
+
+        // loop over thread tensor space [y0, y1, ...]
+        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
+            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
+            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
+
+            static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
+                constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
+
+                // read from bottom tensor
+                get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
+                    smem, bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
+
+                // move thread coordinate
+                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
+                {
+                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
+
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
+
+                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+
+                    smem += size_per_issue; // Note we manually increase the per-issue offset
+                }
+            });
+        });
+    }
+
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
     CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                              number<i_access_unsupport_>          = {},
                               bool_constant<oob_conditional_check> = {}) const
     {
         using Traits = load_store_traits;
@@ -515,7 +602,6 @@ struct tile_window_with_static_distribution
 
         // loop over thread tensor space [y0, y1, ...]
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
-            /// TODO: use structure binding (to be captured later) if compiled in C++20
             auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
             auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
 
@@ -530,7 +616,7 @@ struct tile_window_with_static_distribution
                 vector_t vec_value;
 
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                             : idx_ys_start[jj];
@@ -548,15 +634,19 @@ struct tile_window_with_static_distribution
 
                 // write into bottom tensor
                 get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
-                    bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
+                    bottom_tensor_thread_coord,
+                    0,
+                    vec_value,
+                    bool_constant<oob_conditional_check>{});
 
                 // move thread coordinate
                 if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -565,8 +655,9 @@ struct tile_window_with_static_distribution
         });
     }
 
-    CK_TILE_DEVICE void
-    store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor) const
+    template <index_t i_access_unsupport_ = -1>
+    CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                  number<i_access_unsupport_> = {}) const
     {
         using Traits = load_store_traits;
 
@@ -591,7 +682,7 @@ struct tile_window_with_static_distribution
                 // read from distributed tensor
                 vector_t vec_value;
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                             : idx_ys_start[jj];
@@ -606,15 +697,16 @@ struct tile_window_with_static_distribution
                 // write into bottom tensor
                 get_bottom_tensor_view()
                     .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
-                        bottom_tensor_thread_coord, vec_value);
+                        bottom_tensor_thread_coord, 0, vec_value);
 
                 // move thread coordinate
                 if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -623,8 +715,9 @@ struct tile_window_with_static_distribution
         });
     }
 
-    template <bool oob_conditional_check = true>
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
     CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                               number<i_access_unsupport_>          = {},
                                bool_constant<oob_conditional_check> = {}) const
     {
         using Traits = load_store_traits;
@@ -650,7 +743,7 @@ struct tile_window_with_static_distribution
                 vector_t vec_value;
 
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                             : idx_ys_start[jj];
@@ -666,15 +759,19 @@ struct tile_window_with_static_distribution
 
                 // write into bottom tensor
                 get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
-                    bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
+                    bottom_tensor_thread_coord,
+                    0,
+                    vec_value,
+                    bool_constant<oob_conditional_check>{});
 
                 // move thread coordinate
                 if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                 {
                     constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
 
-                    constexpr auto idx_diff_ps_ys =
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
 
                     move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -746,7 +843,8 @@ struct tile_window_with_static_distribution
             constexpr auto idx_diff_ys =
                 SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
 
-            constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+            constexpr auto idx_diff_ps_ys = container_concat(
+                generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
 
             move_window_adaptor_and_bottom_tensor_thread_coordinate(
                 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -798,6 +896,27 @@ make_tile_window(const TensorView_& tensor_view,
         tensor_view, window_lengths, origin, tile_distribution};
 }
 
+// this version can't be called in a constexpr context
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          index_t NumCoord = 1>
+CK_TILE_DEVICE auto
+make_tile_window_raw(const TensorView_& tensor_view,
+                     const WindowLengths_& window_lengths,
+                     const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                     const StaticTileDistribution_& tile_distribution,
+                     number<NumCoord> = {})
+{
+    auto w = tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
+                                                  remove_cvref_t<WindowLengths_>,
+                                                  remove_cvref_t<StaticTileDistribution_>,
+                                                  NumCoord>{
+        tensor_view, window_lengths, origin, tile_distribution};
+    w.init_raw();
+    return w;
+}
+
 template <typename TensorView_,
           typename WindowLengths_,
           typename StaticTileDistribution_,
@@ -922,6 +1041,19 @@ make_tile_window(const tile_window_with_static_lengths<TensorView, WindowLengths
                             tile_distribution);
 }
 
+template <typename TensorView, typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_raw(const tile_window_with_static_lengths<TensorView, WindowLengths>& tile_window,
+                     const StaticTileDistribution& tile_distribution)
+{
+    auto w = make_tile_window(tile_window.get_bottom_tensor_view(),
+                              tile_window.get_window_lengths(),
+                              tile_window.get_window_origin(),
+                              tile_distribution);
+    w.init_raw();
+    return w;
+}
+
 template <typename TensorView_, typename WindowLengths_>
 CK_TILE_DEVICE void move_tile_window(
     tile_window_with_static_lengths<TensorView_, WindowLengths_>& window,
diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp
new file mode 100644
index 000000000..4b921ec5b
--- /dev/null
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -0,0 +1,1082 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/arch/utility.hpp"
+#include "ck_tile/core/algorithm/space_filling_curve.hpp"
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+#define WINDOW_DISPATCH_ISSUE()                                     \
+    if constexpr(i_access < 0)                                      \
+    {                                                               \
+        static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
+    }                                                               \
+    else                                                            \
+    {                                                               \
+        static_assert(i_access < NumAccess);                        \
+        issue(number<i_access>{});                                  \
+    }
+
+//
+// This version of tile window will pre-cache offset/flags based on need
+//
+// LinearBottomDims_, e.g seq<0, 1> for 2d tensor, the last one is linear dim
+// so last dim can use immediate offset to indexing, can save register
+// TODO: if using this struct, better use load_raw()/store_raw(), can control
+//       the the immediate offset on the fly
+// space-filing-curve is non-snaked here!
+//
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_>
+struct tile_window_linear
+{
+    using BottomTensorView = remove_reference_t<BottomTensorView_>;
+    using WindowLengths    = remove_cvref_t<WindowLengths_>;
+    using TileDstr         = remove_cvref_t<StaticTileDistribution_>;
+
+    using WindowAdaptor    = typename TileDstr::PsYs2XsAdaptor;
+    using BottomTensorDesc = typename BottomTensorView::TensorDesc;
+
+    using DataType         = remove_cvref_t<typename BottomTensorView::DataType>;
+    using LinearBottomDims = remove_cvref_t<LinearBottomDims_>;
+
+    static_assert(LinearBottomDims::size() == BottomTensorView::get_num_of_dimension());
+
+    static constexpr index_t NDimWindowAdaptorTop = WindowAdaptor::get_num_of_top_dimension();
+    static constexpr index_t NDimBottomTensor     = BottomTensorDesc::get_num_of_dimension();
+
+    static constexpr index_t NDimP = TileDstr::get_num_of_dimension_p();
+    static constexpr index_t NDimY = TileDstr::get_num_of_dimension_y();
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    // TODO: check WindowLengths and StaticTileDistribution are consistent
+
+    static_assert(ck_tile::is_known_at_compile_time<WindowLengths>::value,
+                  "wrong! lengths should be static");
+    static_assert(TileDstr::is_static(), "wrong!");
+
+    static_assert(NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(),
+                  "wrong! inconsistent # of diemsnions");
+
+    using AdaptorTopIndex   = array<index_t, NDimWindowAdaptorTop>;
+    using BottomTensorIndex = array<index_t, NDimBottomTensor>;
+
+    using WindowAdaptorCoord =
+        decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{}));
+
+    using BottomTensorCoord =
+        decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{}));
+
+    struct traits
+    {
+        private:
+        // return vector dimension among [y0, y1, ...]
+        CK_TILE_DEVICE static constexpr auto get_window_adaptor_ys_safe_vector_length_strides()
+        {
+            // bottom tensor top dimension vector lengths and strides
+            const auto [bottom_tensor_top_dim_vector_lengths,
+                        bottom_tensor_top_dim_vector_strides] =
+                BottomTensorDesc::get_top_dimension_safe_vector_length_strides();
+
+            // window vector lengths/strides
+            const auto window_adaptor_bottom_dim_vector_lengths =
+                bottom_tensor_top_dim_vector_lengths;
+            const auto window_adaptor_bottom_dim_vector_strides =
+                bottom_tensor_top_dim_vector_strides;
+
+            // window adaptor [p0, p1, ..., y0, y1, ...]
+            array<index_t, WindowAdaptor::get_num_of_hidden_dimension()>
+                window_adaptor_vector_lengths{-1};
+            array<index_t, WindowAdaptor::get_num_of_hidden_dimension()>
+                window_adaptor_vector_strides{-1};
+
+            constexpr auto window_adaptor_bottom_dims =
+                WindowAdaptor::get_bottom_dimension_hidden_ids();
+
+            set_container_subset(window_adaptor_vector_lengths,
+                                 window_adaptor_bottom_dims,
+                                 window_adaptor_bottom_dim_vector_lengths);
+            set_container_subset(window_adaptor_vector_strides,
+                                 window_adaptor_bottom_dims,
+                                 window_adaptor_bottom_dim_vector_strides);
+
+            const auto [window_adaptor_ps_ys_vector_lengths, window_adaptor_ps_ys_vector_strides] =
+                WindowAdaptor{}.get_top_dimension_safe_vector_length_strides(
+                    window_adaptor_vector_lengths, window_adaptor_vector_strides);
+
+            // [y0, y1, ...]
+            constexpr auto y_dims =
+                typename arithmetic_sequence_gen<TileDstr::get_num_of_dimension_p(),
+                                                 NDimWindowAdaptorTop,
+                                                 1>::type{};
+
+            return make_tuple(get_container_subset(window_adaptor_ps_ys_vector_lengths, y_dims),
+                              get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims));
+        }
+
+        static constexpr auto get_vector_dim_y_scalar_per_vector()
+        {
+            const auto [ys_vector_lengths, ys_vector_strides] =
+                get_window_adaptor_ys_safe_vector_length_strides();
+
+            index_t VectorDimY_      = 0;
+            index_t ScalarPerVector_ = 1;
+
+            for(index_t i = 0; i < NDimY; ++i)
+            {
+                if(ys_vector_strides[i] == 1 && ys_vector_lengths[i] > ScalarPerVector_)
+                {
+                    ScalarPerVector_ = ys_vector_lengths[i];
+                    VectorDimY_      = i;
+                }
+            }
+
+            return make_tuple(VectorDimY_, ScalarPerVector_);
+        }
+
+        public:
+        static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
+        static constexpr index_t ScalarPerVector =
+            get_vector_dim_y_scalar_per_vector().template at<1>();
+
+        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+
+        private:
+        static constexpr auto scalars_per_access_ = [] {
+            constexpr auto scalars_per_access_arr = generate_array(
+                [&](auto i) { return (i == VectorDimY) ? ScalarPerVector : 1; }, number<NDimY>{});
+
+            /// TODO: add non-automatic storage argument support to macro TO_SEQUENCE()
+            constexpr auto NDimY_ = NDimY;
+
+            return TO_SEQUENCE(scalars_per_access_arr, NDimY_);
+        }();
+
+        static constexpr auto get_space_filling_curve()
+        {
+            constexpr auto thread_tensor_lengths_ys =
+                to_sequence(TileDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+            // FIXME: need logic to judge dim access order
+            using DimAccessOrder = typename arithmetic_sequence_gen<0, NDimY, 1>::type;
+
+            return space_filling_curve<decltype(thread_tensor_lengths_ys),
+                                       DimAccessOrder,
+                                       decltype(scalars_per_access_),
+                                       false /*!!! no snaked curve! */>{};
+        }
+
+        public:
+        using SFC_Ys = decltype(get_space_filling_curve());
+
+        static constexpr index_t NumAccess = SFC_Ys::get_num_of_access();
+
+        static_assert(0 < NumAccess, "Wrong! NumAccess should be larger than 0");
+
+        private:
+        static constexpr auto get_num_non_linear_access()
+        {
+            constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
+            using ys_to_rhs_major =
+                typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+
+            constexpr auto non_linear = [&]() {
+                index_t cnt = 1;
+                static_for<0, NDimY, 1>{}([&](auto i_dim_y) {
+                    constexpr auto rhs_major    = ys_to_rhs_major{}[i_dim_y];
+                    constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
+                    if constexpr(LinearBottomDims{}[target_h_dim] == 0)
+                    {
+                        cnt *= sfc_access_lens[i_dim_y];
+                    }
+                });
+                return cnt;
+            }();
+
+            return non_linear;
+        }
+
+        // example:
+        // non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 access, totally 2 register
+        // used
+        //  -> histogram : sequence<4, 4>
+        //  -> prefixsum : seqneuce<0, 4, 8>
+        // non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 access, totally 8 register
+        // used, will pre-cache 8
+        //  -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1>
+        //  -> prefixsum : seqneuce<0, 1, 2, 3, 4, 5, 6, 7, 8>
+        // non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 access, totally 4 register
+        // used, will pre-cache 4
+        //  -> histogram : sequence<2, 2, 2, 2>
+        //  -> prefixsum : seqneuce<0, 2, 4, 6, 8>
+        static constexpr auto get_non_linear_access_map()
+        {
+            constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
+            using ys_to_rhs_major =
+                typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+            constexpr auto non_linear_map = [&]() {
+                array<index_t, NumAccess> m_{0};
+                index_t cumulative_len_            = 1;
+                index_t cumulative_non_linear_len_ = 1;
+                static_for<0, NDimY, 1>{}([&](auto i_y) {
+                    constexpr auto i_dim_y       = number<NDimY - i_y - 1>{}; // from right to left
+                    constexpr auto rhs_major     = ys_to_rhs_major{}[i_dim_y];
+                    constexpr auto target_h_dim  = number<rhs_major - 1>{}; // no r dim here!
+                    constexpr auto is_linear_dim = LinearBottomDims{}[target_h_dim];
+
+                    array<index_t, NumAccess> current_m_{0};
+                    constexpr auto current_len_ = sfc_access_lens[i_dim_y];
+
+                    // copy cumulative length as current pattern
+                    for(auto i_ = 0; i_ < cumulative_len_; i_++)
+                    {
+                        current_m_(i_) = m_[i_];
+                    }
+                    for(auto j_ = 0; j_ < current_len_; j_++)
+                    {
+                        auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
+                        for(auto i_ = 0; i_ < cumulative_len_; i_++)
+                        {
+                            m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
+                        }
+                    }
+                    cumulative_len_ *= current_len_;
+                    if(!is_linear_dim)
+                        cumulative_non_linear_len_ *= current_len_;
+                });
+                return m_;
+            }();
+
+            return TO_SEQUENCE(non_linear_map, NumAccess);
+        }
+
+        static constexpr auto get_non_linear_access_histogram()
+        {
+            constexpr auto m_ = get_non_linear_access_map();
+            // m_.foo();
+
+            constexpr auto r_ =
+                typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};
+
+            constexpr auto h_ = histogram_sorted_sequence(m_, r_);
+
+            return h_;
+        }
+
+        static constexpr auto get_non_linear_access_histogram_prefix_sum()
+        {
+            constexpr auto h_            = get_non_linear_access_histogram();
+            constexpr auto h_prefix_sum_ = prefix_sum_sequence(h_);
+            return h_prefix_sum_;
+        }
+
+        public:
+        static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access();
+        using AccessMap_NonLinear       = decltype(get_non_linear_access_map()); // sequence
+        using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram());
+        using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum());
+    };
+
+    static constexpr index_t NumAccess           = traits::NumAccess;
+    static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear;
+    using AccessMap_NonLinear                    = typename traits::AccessMap_NonLinear;
+    using AccessHistogram_NonLinear              = typename traits::AccessHistogram_NonLinear;
+    using AccessPrefixSum_NonLinear              = typename traits::AccessPrefixSum_NonLinear;
+
+    CK_TILE_DEVICE constexpr tile_window_linear() = default;
+
+    CK_TILE_DEVICE constexpr tile_window_linear(const BottomTensorView& bottom_tensor_view,
+                                                const WindowLengths& window_lengths,
+                                                const BottomTensorIndex& window_origin,
+                                                const TileDstr& tile_distribution)
+        : bottom_tensor_view_{bottom_tensor_view},
+          window_lengths_{window_lengths},
+          window_origin_{window_origin},
+          tile_dstr_{tile_distribution},
+          cached_coords_{},
+          cached_flags_{}
+    {
+        auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
+            tile_distribution.get_ps_ys_to_xs_adaptor(),
+            container_concat(make_tuple(get_warp_id(), get_lane_id()),
+                             generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
+
+        BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
+            window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
+
+        auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
+            bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
+
+        // future load/store() calls (might allocate more registers)
+        using SFC_Ys = typename traits::SFC_Ys;
+
+        static_for<0, NumAccess, 1>{}([&](auto i_access) {
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
+            constexpr auto need_save_non_linear_coord =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
+
+            if constexpr(need_save_non_linear_coord)
+            {
+                cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
+            }
+
+            // TODO: need pad_tensor_view to check which dim need use flag to check
+            //      cached flag is independent from non-linear-coord
+            //      but need be updated in move_tile, with proper dims
+            cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
+                bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp);
+
+            if constexpr(i_access != (NumAccess - 1))
+            {
+                constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
+                constexpr auto idx_diff_ps_ys = container_concat(
+                    generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                    idx_diff_ys);
+
+                move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                    window_adaptor_thread_coord_tmp,
+                    bottom_tensor_thread_coord_tmp,
+                    idx_diff_ps_ys);
+            }
+        });
+    }
+
+    CK_TILE_DEVICE static constexpr index_t get_num_of_dimension() { return NDimBottomTensor; }
+
+    CK_TILE_DEVICE static constexpr bool has_static_tile_distribution()
+    {
+        return TileDstr::is_static();
+    }
+
+    CK_TILE_DEVICE constexpr auto get_window_lengths() const { return window_lengths_; }
+
+    CK_TILE_DEVICE constexpr auto get_tile_distribution() const { return tile_dstr_; }
+
+    CK_TILE_DEVICE constexpr auto get_bottom_tensor_view() const { return bottom_tensor_view_; }
+
+    CK_TILE_DEVICE constexpr auto get_window_origin() const { return window_origin_; }
+
+    CK_TILE_DEVICE constexpr void
+    set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType* data)
+    {
+        bottom_tensor_view_.buf_.p_data_ = data;
+    }
+
+    // move thread's window adaptor coordinate and bottom tensor coordinate
+    // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
+    template <typename ATopIndex>
+    CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
+        WindowAdaptorCoord& window_adaptor_thread_coord,
+        BottomTensorCoord& bottom_tensor_thread_coord,
+        const ATopIndex& idx_diff_adaptor_top) const
+    {
+        array<index_t, NDimBottomTensor> idx_diff_adaptor_bottom;
+
+        move_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(),
+                                       window_adaptor_thread_coord,
+                                       idx_diff_adaptor_top,
+                                       idx_diff_adaptor_bottom);
+
+        move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
+                               bottom_tensor_thread_coord,
+                               idx_diff_adaptor_bottom);
+    }
+
+    template <index_t i_access>
+    CK_TILE_DEVICE static constexpr auto get_bottom_linear_coordinate(number<i_access>)
+    {
+        using SFC_Ys          = typename traits::SFC_Ys;
+        constexpr auto idx_ys = SFC_Ys::get_index(number<i_access>{});
+        using ys_to_rhs_major =
+            typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+
+        constexpr auto modified_idx_ys = generate_tuple(
+            [&](auto i_dim_y) {
+                constexpr auto rhs_major    = ys_to_rhs_major{}[i_dim_y];
+                constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
+                if constexpr(LinearBottomDims{}[target_h_dim] == 0)
+                {
+                    return number<0>{};
+                }
+                else
+                {
+                    return number<idx_ys[i_dim_y]>{};
+                }
+            },
+            number<NDimY>{});
+
+        constexpr auto adaptor_ = TileDstr{}.get_ps_ys_to_xs_adaptor();
+        constexpr auto idx_ =
+            container_concat(make_tuple(number<0>{}, number<0>{}), modified_idx_ys);
+
+        return adaptor_.calculate_bottom_index(idx_);
+    }
+
+    template <index_t i_access>
+    CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
+    {
+        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
+        // since this is linear offset, we assum bottom X tensor is always linear
+        constexpr index_t linear_offset = [&]() {
+            constexpr auto x_idx_ = linear_coord;
+            constexpr auto x_len_ = TileDstr{}.get_lengths();
+            static_assert(x_idx_.size() == x_len_.size());
+            constexpr index_t x_dims_ = x_idx_.size();
+            index_t cu_stride_        = 1;
+            index_t cu_offset_        = 0;
+            static_for<0, x_dims_, 1>{}([&](auto i_) {
+                auto r_i_ = number<x_dims_ - i_ - 1>{};
+                cu_offset_ += x_idx_[r_i_] * cu_stride_;
+                cu_stride_ *= x_len_[r_i_];
+            });
+            return cu_offset_;
+        }();
+
+        return linear_offset;
+    }
+
+    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }
+
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(number<i_access> = {}, bool_constant<oob_conditional_check> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
+
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess = number<i_access_>{};
+
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
+
+            // read from bottom tensor
+            const vector_t vec_value =
+                get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
+                    bottom_tensor_thread_coord,
+                    linear_offset,
+                    bottom_tensor_flag,
+                    bool_constant<oob_conditional_check>{});
+#if 1
+            // data index [y0, y1, ...]
+            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
+            // write into distributed tensor
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
+                    },
+                    number<NDimY>{});
+
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                dst_tensor.get_thread_buffer().template at<d>() =
+                    vec_value.template get_as<DataType>()[j];
+            });
+#else
+            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            static_assert(d % traits::ScalarPerVector == 0);
+
+            dst_tensor.get_thread_buffer().template get_as<vector_t>()(
+                number<d / traits::ScalarPerVector>{}) = bit_cast<vector_t>(vec_value);
+#endif
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+
+        return dst_tensor;
+    }
+
+    template <typename DstTile,
+              index_t i_access           = -1,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false>
+    CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
+                                 number<i_access> = {}, // negative means loop over all num_access
+                                 bool_constant<oob_conditional_check> = {},
+                                 bool_constant<pre_nop>               = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        static constexpr index_t YElementSize =
+            TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
+        static_assert(YElementSize % traits::ScalarPerVector == 0);
+        using vectorized_tbuf = array<vector_t, YElementSize / traits::ScalarPerVector>;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        auto& dst_vec_tbuf = reinterpret_cast<vectorized_tbuf&>(dst_tensor.get_thread_buffer());
+
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess  = number<i_access_>{};
+            constexpr auto pre_nop_ = [&]() {
+                if constexpr(pre_nop && i_access_ == 0 &&
+                             BottomTensorView::buffer_view::get_address_space() ==
+                                 address_space_enum::global)
+                    return bool_constant<true>{};
+                else
+                    return bool_constant<false>{};
+            }();
+
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            static_assert(d % traits::ScalarPerVector == 0);
+
+            get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
+                dst_vec_tbuf.template at<d / traits::ScalarPerVector>(),
+                bottom_tensor_thread_coord,
+                linear_offset /**/,
+                bottom_tensor_flag,
+                bool_constant<oob_conditional_check>{},
+                pre_nop_);
+#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
+    CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
+            asm volatile(""); // this is starting from rocm-6.2, but same sympton, reuse this flag
+#endif
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    // TODO: currently async load only implemented in inline asm
+    template <typename LdsTileWindow_,
+              index_t i_access           = -1,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false>
+    CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
+                                       number<i_access>                     = {},
+                                       bool_constant<oob_conditional_check> = {},
+                                       bool_constant<pre_nop>               = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+
+        // currently we only support everything is non linear dim
+        // actually it's not performant if we have linear dim(e.g. fast changing)
+        static_assert(NumAccess_NonLinear == NumAccess);
+        static_assert(BottomTensorView::buffer_view::get_address_space() ==
+                      address_space_enum::global);
+
+        // issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+
+        const index_t size_per_buf =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<0>{}, number<0>{})) *
+            sizeof(LdsDataType);
+
+        const index_t size_per_wave =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<1>{}, number<0>{})) *
+                sizeof(LdsDataType) -
+            size_per_buf;
+
+        const index_t size_per_issue =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<1>{}, number<0>{}, number<0>{})) *
+                sizeof(LdsDataType) -
+            size_per_buf;
+
+        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
+        m0_set_with_memory(m0_init_value); // This should be wave independent
+
+        using vector_t = typename traits::vector_t;
+
+        LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess  = number<i_access_>{};
+            constexpr auto pre_nop_ = [&]() {
+                if constexpr(pre_nop && i_access_ == 0)
+                    return bool_constant<true>{};
+                else
+                    return bool_constant<false>{};
+            }();
+
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            auto bottom_tensor_flag         = cached_flags_[IAccess]; // get this flag anyway
+
+            // read from bottom tensor
+            get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
+                smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);
+
+            // move thread coordinate
+            if constexpr(i_access_ != (NumAccess - 1))
+            {
+                m0_inc_with_memory(size_per_issue);
+            }
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    template <typename LdsTileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
+                                   number<i_access>                     = {},
+                                   bool_constant<oob_conditional_check> = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+
+        // currently we only support everything is non linear dim
+        // actually it's not performant if we have linear dim(e.g. fast changing)
+        static_assert(NumAccess_NonLinear == NumAccess);
+        static_assert(BottomTensorView::buffer_view::get_address_space() ==
+                      address_space_enum::global);
+
+        // issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+
+        // TODO: LDS offset is not good for intrinsic based implementation(compiler can't figure out
+        // dependency) hence avoid use offset based solution. size_per_buf should be zero (how to
+        // check?)
+        constexpr index_t size_per_buf =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<0>{}, number<0>{}));
+
+        constexpr index_t size_per_wave =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<1>{}, number<0>{})) -
+            size_per_buf;
+
+        constexpr index_t size_per_issue =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<1>{}, number<0>{}, number<0>{})) -
+            size_per_buf;
+
+        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
+
+        using vector_t = typename traits::vector_t;
+
+        // TODO: we force CK_TILE_LDS_ADDR
+        CK_TILE_LDS_ADDR LdsDataType* smem =
+            lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value;
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            // read from bottom tensor
+            get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
+                smem,
+                bottom_tensor_thread_coord,
+                0,
+                bottom_tensor_flag,
+                bool_constant<oob_conditional_check>{});
+
+            // move thread coordinate
+            if constexpr(i_access_ != (NumAccess - 1))
+            {
+                smem += size_per_issue; // Note we manually increase the per-issue offset
+            }
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                              number<i_access>                     = {},
+                              bool_constant<oob_conditional_check> = {}) const
+    {
+
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+
+            // read from distributed tensor
+            vector_t vec_value;
+
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+
+            // write into bottom tensor
+            get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
+                bottom_tensor_thread_coord,
+                linear_offset,
+                bottom_tensor_flag,
+                vec_value,
+                bool_constant<oob_conditional_check>{});
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    template <index_t i_access = -1>
+    CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                  number<i_access> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr                    = TileDstr{};
+        static constexpr bool oob_conditional_check = true;
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+
+            // read from distributed tensor
+            vector_t vec_value;
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+
+            // write into bottom tensor
+            get_bottom_tensor_view()
+                .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
+                    bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                               number<i_access>                     = {},
+                               bool_constant<oob_conditional_check> = {}) const
+    {
+
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+
+            // read from distributed tensor
+            vector_t vec_value;
+
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+
+            // write into bottom tensor
+            get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
+                bottom_tensor_thread_coord,
+                linear_offset,
+                bottom_tensor_flag,
+                vec_value,
+                bool_constant<oob_conditional_check>{});
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
+    // move thread's botom tensor coordiante
+    // [x0', x1', ... ] ==> [offset]
+    // also move window-origin
+    CK_TILE_DEVICE void move(const BottomTensorIndex& step)
+    {
+        window_origin_ += step;
+
+        static_for<0, NumAccess, 1>{}([&](auto i_access) {
+            constexpr auto IAccess       = number<i_access>{};
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
+            constexpr auto need_update_non_linear_coord =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
+
+            if constexpr(need_update_non_linear_coord)
+            {
+                move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
+                                       cached_coords_(non_linear_id),
+                                       step);
+            }
+
+            // move the current coord with linear_coords
+            auto tmp_coords             = cached_coords_[non_linear_id];
+            constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess);
+            move_tensor_coordinate(
+                bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord);
+
+            cached_flags_(IAccess) = coordinate_has_valid_offset_assuming_top_index_is_valid(
+                bottom_tensor_view_.get_tensor_descriptor(), tmp_coords);
+        });
+    }
+
+    CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex& new_window_origin)
+    {
+        window_origin_ = new_window_origin;
+
+        auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
+            TileDstr{}.get_ps_ys_to_xs_adaptor(),
+            container_concat(make_tuple(get_warp_id(), get_lane_id()),
+                             generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
+
+        BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
+            window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
+
+        auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
+            bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
+
+        // future load/store() calls (might allocate more registers)
+        using SFC_Ys = typename traits::SFC_Ys;
+
+        static_for<0, NumAccess, 1>{}([&](auto i_access) {
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
+            constexpr auto need_save_non_linear_coord =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
+
+            if constexpr(need_save_non_linear_coord)
+            {
+                cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
+            }
+
+            if constexpr(i_access != (NumAccess - 1))
+            {
+                constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
+                constexpr auto idx_diff_ps_ys = container_concat(
+                    generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                    idx_diff_ys);
+
+                move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                    window_adaptor_thread_coord_tmp,
+                    bottom_tensor_thread_coord_tmp,
+                    idx_diff_ps_ys);
+            }
+        });
+    }
+
+    CK_TILE_HOST_DEVICE void init_raw() { bottom_tensor_view_.init_raw(); }
+
+    // this is the bottom tensor view
+    // [x0', x1', ...] ==> [offset]
+    BottomTensorView bottom_tensor_view_;
+
+    //
+    WindowLengths window_lengths_;
+
+    // origin ([x0', x1', ...]) of window on bottom tensor
+    BottomTensorIndex window_origin_;
+
+    // Tile tensor distribution, which contains:
+    //   1. adaptor for window: [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...]
+    //   2. thread descriptor for thread tensor in register: [y0, y1, ...] ==> [d]
+    TileDstr tile_dstr_;
+
+    // this contains:
+    array<BottomTensorCoord, traits::NumAccess_NonLinear> cached_coords_;
+    array<bool, traits::NumAccess> cached_flags_;
+};
+
+#undef WINDOW_DISPATCH_ISSUE
+
+namespace impl {
+template <address_space_enum, index_t len_>
+struct default_linear_bottom_dims_impl
+{
+    using type = typename uniform_sequence_gen<len_, 0>::type;
+};
+
+template <index_t len_>
+struct default_linear_bottom_dims_impl<address_space_enum::global, len_>
+{
+    // global default to seq<0,0,....1>
+    using type = typename sequence_merge<typename uniform_sequence_gen<len_ - 1, 0>::type,
+                                         sequence<1>>::type;
+};
+
+template <index_t len_>
+struct default_linear_bottom_dims_impl<address_space_enum::lds, len_>
+{
+    // lds default to seq<1,1.....1>
+    using type = typename uniform_sequence_gen<len_, 1>::type;
+};
+} // namespace impl
+
+template <typename TensorView_>
+using default_linear_bottom_dims =
+    typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
+                                                   TensorView_::get_num_of_dimension()>::type;
+
+// if using this API, will create a tile_window_linear
+// this structure can have the chance to use immediate value, save register
+// need pass in LinearBottomDims_ properly to control which dim is linear
+// so to generate a constexpr offset as linear_offset for this dim
+// (and finally pass to the immediate offset of buffer/lds instruction)
+//
+// Note: there is no internal check for which dim is OK to use linear offset
+// user must make sure by themselves
+//
+// e.g.
+// 2d global matrix, set LinearBottomDims_=seq<0, 1>, the last dim will generate
+// immediate offset if each thread has multiple issue along last dim
+//
+// 2d LDS buffer, set LinearBottomDims_=seq<1, 1>, then only one vgpr used as offset
+// everything else is just using immediate offset.
+//
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_linear(const TensorView_& tensor_view,
+                        const WindowLengths_& window_lengths,
+                        const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                        const StaticTileDistribution_& tile_distribution,
+                        LinearBottomDims_ = {})
+{
+    static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
+    return tile_window_linear<remove_cvref_t<TensorView_>,
+                              remove_cvref_t<WindowLengths_>,
+                              remove_cvref_t<StaticTileDistribution_>,
+                              remove_cvref_t<LinearBottomDims_>>{
+        tensor_view, window_lengths, origin, tile_distribution};
+}
+
+template <
+    typename TileWindow_,
+    typename StaticTileDistribution_,
+    typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_linear(const TileWindow_& tile_window,
+                        const StaticTileDistribution_& tile_distribution,
+                        LinearBottomDims_ = {})
+{
+    return make_tile_window_linear(tile_window.get_bottom_tensor_view(),
+                                   tile_window.get_window_lengths(),
+                                   tile_window.get_window_origin(),
+                                   tile_distribution,
+                                   LinearBottomDims_{});
+}
+
+// this version must not be called under a constexpr context
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
+CK_TILE_DEVICE auto
+make_tile_window_linear_raw(const TensorView_& tensor_view,
+                            const WindowLengths_& window_lengths,
+                            const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                            const StaticTileDistribution_& tile_distribution,
+                            LinearBottomDims_ = {})
+{
+    static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
+    auto w = tile_window_linear<remove_cvref_t<TensorView_>,
+                                remove_cvref_t<WindowLengths_>,
+                                remove_cvref_t<StaticTileDistribution_>,
+                                remove_cvref_t<LinearBottomDims_>>{
+        tensor_view, window_lengths, origin, tile_distribution};
+    w.init_raw();
+    return w;
+}
+
+template <
+    typename TileWindow_,
+    typename StaticTileDistribution_,
+    typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_linear_raw(const TileWindow_& tile_window,
+                            const StaticTileDistribution_& tile_distribution,
+                            LinearBottomDims_ = {})
+{
+    return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(),
+                                       tile_window.get_window_lengths(),
+                                       tile_window.get_window_origin(),
+                                       tile_distribution,
+                                       LinearBottomDims_{});
+}
+
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_>
+CK_TILE_DEVICE void move_tile_window(
+    tile_window_linear<TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_>&
+        window,
+    const typename tile_window_linear<TensorView_,
+                                      WindowLengths_,
+                                      StaticTileDistribution_,
+                                      LinearBottomDims_>::BottomTensorIndex& step)
+{
+    window.move(step);
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/utility/magic_div.hpp b/include/ck_tile/core/utility/magic_div.hpp
index 09038ba29..fd9c733c5 100644
--- a/include/ck_tile/core/utility/magic_div.hpp
+++ b/include/ck_tile/core/utility/magic_div.hpp
@@ -59,8 +59,16 @@ struct magic_division32_bit_range
     CK_TILE_DEVICE static constexpr uint32_t
     do_magic_division(uint32_t dividend, uint32_t multiplier, uint32_t shift)
     {
-        uint32_t tmp = __umulhi(dividend, multiplier);
-        return (tmp + dividend) >> shift;
+        if(__builtin_is_constant_evaluated())
+        {
+            uint32_t tmp = (static_cast<uint64_t>(dividend) * multiplier) >> 32;
+            return (tmp + dividend) >> shift;
+        }
+        else
+        {
+            uint32_t tmp = __umulhi(dividend, multiplier);
+            return (tmp + dividend) >> shift;
+        }
     }
 
     CK_TILE_HOST static constexpr uint32_t
@@ -77,9 +85,18 @@ struct magic_division32_bit_range
     CK_TILE_DEVICE static constexpr int32_t
     do_magic_division(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
     {
-        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
-        uint32_t tmp          = __umulhi(dividend_u32, multiplier);
-        return (tmp + dividend_u32) >> shift;
+        if(__builtin_is_constant_evaluated())
+        {
+            uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+            uint32_t tmp          = (static_cast<uint64_t>(dividend_u32) * multiplier) >> 32;
+            return (tmp + dividend_u32) >> shift;
+        }
+        else
+        {
+            uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+            uint32_t tmp          = __umulhi(dividend_u32, multiplier);
+            return (tmp + dividend_u32) >> shift;
+        }
     }
 
     CK_TILE_HOST static constexpr int32_t
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index dbc1f5d23..e17d7c22a 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -24,5 +24,6 @@
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
+#include "ck_tile/host/reference/reference_topk.hpp"
 #include "ck_tile/host/stream_config.hpp"
 #include "ck_tile/host/timer.hpp"
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index f490bbdeb..335911860 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -10,6 +10,7 @@
 #include <random>
 #include <type_traits>
 #include <utility>
+#include <unordered_set>
 
 #include "ck_tile/core.hpp"
 
@@ -41,6 +42,73 @@ struct FillUniformDistribution
     }
 };
 
+namespace impl {
+
+// clang-format off
+template<index_t bytes> struct RawIntegerType_ {};
+template<> struct RawIntegerType_<1> { using type = uint8_t;};
+template<> struct RawIntegerType_<2> { using type = uint16_t;};
+template<> struct RawIntegerType_<4> { using type = uint32_t;};
+template<> struct RawIntegerType_<8> { using type = uint64_t;};
+// clang-format on
+
+template <typename T>
+using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
+} // namespace impl
+
+// Note: this struct will have no const-ness will generate random
+template <typename T>
+struct FillUniformDistribution_Unique
+{
+    float a_{-5.f};
+    float b_{5.f};
+    std::optional<uint32_t> seed_{11939};
+
+    std::mt19937 gen_{};
+    std::unordered_set<impl::RawIntegerType<T>> set_{};
+
+    FillUniformDistribution_Unique(float a                      = -5.f,
+                                   float b                      = 5.f,
+                                   std::optional<uint32_t> seed = {11939})
+        : a_(a),
+          b_(b),
+          seed_(seed),
+          gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
+          set_{}
+    {
+    }
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last)
+    {
+        std::mt19937& gen = gen_;
+        std::uniform_real_distribution<float> dis(a_, b_);
+        auto& set = set_;
+        std::generate(first, last, [&dis, &gen, &set]() {
+            T v = static_cast<T>(0);
+            do
+            {
+                v = ck_tile::type_convert<T>(dis(gen));
+            } while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
+            set.insert(bit_cast<impl::RawIntegerType<T>>(v));
+
+            return v;
+        });
+    }
+
+    template <typename ForwardRange>
+    auto operator()(ForwardRange&& range)
+        -> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
+            std::begin(std::forward<ForwardRange>(range)),
+            std::end(std::forward<ForwardRange>(range))))>
+    {
+        (*this)(std::begin(std::forward<ForwardRange>(range)),
+                std::end(std::forward<ForwardRange>(range)));
+    }
+
+    void clear() { set_.clear(); }
+};
+
 template <typename T>
 struct FillNormalDistribution
 {
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index f533d5c18..5610ba324 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -11,6 +11,7 @@
 #include <thread>
 #include <utility>
 #include <vector>
+#include <functional>
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/ranges.hpp"
@@ -545,6 +546,28 @@ struct HostTensor
 
     typename Data::size_type size() const { return mData.size(); }
 
+    // return a slice of this tensor
+    // for simplicity we just copy the data and return a new tensor
+    auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
+    {
+        assert(s_begin.size() == s_end.size());
+        assert(s_begin.size() == get_num_of_dimension());
+
+        std::vector<size_t> s_len(s_begin.size());
+        std::transform(
+            s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
+        HostTensor<T> sliced_tensor(s_len);
+
+        sliced_tensor.ForEach([&](auto& self, auto idx) {
+            std::vector<size_t> src_idx(idx.size());
+            std::transform(
+                idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
+            self(idx) = operator()(src_idx);
+        });
+
+        return sliced_tensor;
+    }
+
     template <typename U = T>
     auto AsSpan() const
     {
diff --git a/include/ck_tile/host/reference/reference_softmax.hpp b/include/ck_tile/host/reference/reference_softmax.hpp
index f1404f85a..d86e87994 100644
--- a/include/ck_tile/host/reference/reference_softmax.hpp
+++ b/include/ck_tile/host/reference/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -9,43 +9,81 @@
 
 namespace ck_tile {
 
-template <typename ADataType, typename AccDataType, typename BDataType>
-CK_TILE_HOST void reference_softmax(const HostTensor<ADataType>& a_m_n,
-                                    HostTensor<BDataType>& b_m_n)
+template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
+CK_TILE_HOST void
+reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
 {
-    auto f = [&](auto m) {
-        const int N = a_m_n.mDesc.get_lengths()[1];
+    index_t rank = x.get_num_of_dimension();
+    assert(rank == y.get_num_of_dimension());
+    assert(dim == -1 || dim < rank);
 
-        AccDataType v_max = ck_tile::numeric<ADataType>::Lowest();
+    index_t target_dim  = dim == -1 ? (rank - 1) : dim;
+    index_t softmax_len = x.get_length(target_dim);
+    index_t n_parallel  = x.get_element_size() / softmax_len;
+    auto x_len          = x.get_lengths();
 
-        // max
-        for(int n = 0; n < N; ++n)
-        {
-            const ADataType v_a = a_m_n(m, n);
+    auto f = [&](auto i_element) {
+        std::vector<size_t> coord = [&]() {
+            std::vector<size_t> t_(rank, 0);
+            size_t r = i_element;
+            for(index_t i = rank - 1; i >= 0; i--)
+            {
+                if(i == target_dim)
+                    continue;
+                t_[i] = r % x_len[i];
+                r     = r / x_len[i];
+            }
+            return t_;
+        }();
+
+        ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
 
-            v_max = v_max < v_a ? v_a : v_max;
+        // compute max
+        for(auto idx = 0; idx < softmax_len; idx++)
+        {
+            auto c_               = coord;
+            c_[target_dim]        = idx;
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+            v_max                 = v_max < v_x ? v_x : v_max;
         }
 
-        AccDataType v_exp_sum = 0;
+        ComputeType v_exp_sum = static_cast<ComputeType>(0);
 
         // sum
-        for(int n = 0; n < N; ++n)
+        for(auto idx = 0; idx < softmax_len; idx++)
         {
-            const ADataType v_a = a_m_n(m, n);
+            auto c_        = coord;
+            c_[target_dim] = idx;
 
-            v_exp_sum += ck_tile::exp(v_a - v_max);
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+
+            v_exp_sum += ck_tile::exp(v_x - v_max);
         }
 
         // elementwise
-        for(int n = 0; n < N; ++n)
+        for(auto idx = 0; idx < softmax_len; idx++)
         {
-            const ADataType v_a = a_m_n(m, n);
+            auto c_        = coord;
+            c_[target_dim] = idx;
+
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+
+            auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
 
-            b_m_n(m, n) = ck_tile::exp(v_a - v_max) / v_exp_sum;
+            y(c_) = ck_tile::type_convert<OutputType>(out);
         }
     };
 
-    make_ParallelTensorFunctor(f,
-                               b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
+}
+
+template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
+CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
+{
+    HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
+
+    reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
+
+    return y;
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp
new file mode 100644
index 000000000..3d0404a2e
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_topk.hpp
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+#include <numeric>
+#include <functional>
+#include <utility>
+#include <algorithm>
+
+namespace ck_tile {
+
+/*
+    similiar to torch.topk()
+    x (Tensor) – the input tensor.
+    k (int) – the k in “top-k”
+    dim (int, optional) – the dimension to sort along
+    largest (bool, optional) – largest or smallest elements
+    sorted (bool, optional) – elements in sorted order or not
+
+    output:
+    y_values
+    y_indices
+
+    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h
+*/
+template <typename DataType, typename IndexType = index_t>
+CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
+                                 HostTensor<DataType>& y_values,
+                                 HostTensor<IndexType>& y_indices,
+                                 index_t k,
+                                 index_t dim  = -1,
+                                 bool largest = true,
+                                 bool sorted  = true)
+{
+    // rank must be the same
+    index_t rank = x.get_num_of_dimension();
+    assert(rank == y_values.get_num_of_dimension());
+    assert(rank == y_indices.get_num_of_dimension());
+    assert(dim == -1 || dim < rank);
+
+    index_t topk_dim     = dim == -1 ? (rank - 1) : dim;
+    index_t topk_src_len = x.get_length(topk_dim);
+    auto x_len           = x.get_lengths();
+
+    assert(k <= topk_src_len);
+    assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
+
+    index_t n_parallel = x.get_element_size() / topk_src_len;
+
+    // clang-format off
+    auto f = [&](auto i_element) {
+        std::vector<size_t> topk_coord = [&](){
+            std::vector<size_t> t_(rank, 0);
+            size_t r = i_element;
+            for(index_t i = rank - 1; i >= 0; i--) {
+                if(i == topk_dim)          continue; // topk dim should be zero
+                t_[i] = r % x_len[i];      r = r / x_len[i];
+            }
+            return t_;
+        }();
+
+        using elem_t = std::pair<DataType, IndexType>;
+        std::vector<elem_t> q = [&](){
+            std::vector<elem_t> t_(topk_src_len);
+            for(index_t i = 0; i < topk_src_len; i++) {
+                auto c_ = topk_coord;  c_[topk_dim] = i;
+                t_[i].first = x(c_);   t_[i].second = i;
+            }
+            return t_;
+        }();
+
+        // run topk
+        if(largest) {
+            std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
+            [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
+            if(sorted) {
+                std::sort(q.begin(), q.begin() + k - 1,
+                [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
+            }
+        } else {
+            std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
+            [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
+            if(sorted) {
+                std::sort(q.begin(), q.begin() + k - 1,
+                [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
+            }
+        }
+
+        // write out
+        for(index_t i = 0; i < k; i++) {
+            auto c_ = topk_coord;  c_[topk_dim] = i;
+            y_values(c_) = q[i].first;  y_indices(c_) = q[i].second;
+        }
+    };
+    // clang-format on
+
+    make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
+}
+
+// TODO: if using this method, the return tensor would be dense(no stride)
+template <typename DataType, typename IndexType = index_t>
+CK_TILE_HOST auto reference_topk(const HostTensor<DataType>& x,
+                                 index_t k,
+                                 index_t dim  = -1,
+                                 bool largest = true,
+                                 bool sorted  = true)
+{
+    auto lens          = x.get_lengths();
+    index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim;
+    assert(target_dim < lens.size());
+    assert(k <= lens[target_dim]);
+    lens[target_dim] = k;
+    HostTensor<DataType> y_values(lens);
+    HostTensor<IndexType> y_indices(lens);
+
+    reference_topk<DataType, IndexType>(x, y_values, y_indices, k, dim, largest, sorted);
+
+    return ck_tile::make_tuple(y_values, y_indices);
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp
new file mode 100644
index 000000000..62ba9dc0b
--- /dev/null
+++ b/include/ck_tile/ops/elementwise.hpp
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
new file mode 100644
index 000000000..01217e16c
--- /dev/null
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -0,0 +1,1163 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include <type_traits>
+
+namespace ck_tile {
+namespace element_wise {
+
+#if 0
+struct PassThroughPack2
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::half2_t& y, const ck_tile::f8x2_t& x) const
+    {
+        auto t = type_convert<float2_t>(x);
+        y      = type_convert<half2_t>(t);
+    }
+    constexpr const static bool is_pack2_invocable = true;
+};
+#endif
+
+struct PassThrough
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<double, double>(double& y, const double& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, double>(float& y, const double& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<double, float>(double& y, const float& x) const
+    {
+        y = type_convert<double>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp16_t, float>(ck_tile::fp16_t& y,
+                                                                const float& x) const
+    {
+        y = type_convert<ck_tile::fp16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::bf16_t, ck_tile::bf16_t>(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf16_t, float>(ck_tile::bf16_t& y,
+                                                                const float& x) const
+    {
+        y = type_convert<ck_tile::bf16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::bf16_t>(float& y,
+                                                                const ck_tile::bf16_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::bf16_t, ck_tile::fp16_t>(ck_tile::bf16_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = type_convert<ck_tile::bf16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::fp16_t>(float& y,
+                                                                const ck_tile::fp16_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp16_t, int8_t>(ck_tile::fp16_t& y,
+                                                                 const int8_t& x) const
+    {
+        y = type_convert<ck_tile::fp16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf16_t, int8_t>(ck_tile::bf16_t& y,
+                                                                 const int8_t& x) const
+    {
+        y = type_convert<ck_tile::bf16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<uint8_t, uint8_t>(uint8_t& y, const uint8_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
+    {
+        y = type_convert<int8_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int32_t, int8_t>(int32_t& y, const int8_t& x) const
+    {
+        y = type_convert<int32_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int8_t, float>(int8_t& y, const float& x) const
+    {
+        y = type_convert<int8_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, int8_t>(float& y, const int8_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int4_t, int4_t>(int4_t& y, const int4_t& x) const
+    {
+        y = x;
+    }
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int4_t, int>(int4_t& y, const int& x) const
+    {
+        y = type_convert<int4_t>(x);
+    }
+#endif
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp8_t, ck_tile::fp8_t>(ck_tile::fp8_t& y, const ck_tile::fp8_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::fp8_t>(float& y,
+                                                               const ck_tile::fp8_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp8_t, float>(ck_tile::fp8_t& y,
+                                                               const float& x) const
+    {
+        y = type_convert<ck_tile::fp8_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp16_t, ck_tile::fp8_t>(ck_tile::fp16_t& y, const ck_tile::fp8_t& x) const
+    {
+        y = type_convert<ck_tile::fp16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp8_t, ck_tile::fp16_t>(ck_tile::fp8_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = type_convert<ck_tile::fp8_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::bf8_t, ck_tile::bf8_t>(ck_tile::bf8_t& y, const ck_tile::bf8_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::bf8_t>(float& y,
+                                                               const ck_tile::bf8_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf8_t, float>(ck_tile::bf8_t& y,
+                                                               const float& x) const
+    {
+        y = type_convert<ck_tile::bf8_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp16_t, ck_tile::bf8_t>(ck_tile::fp16_t& y, const ck_tile::bf8_t& x) const
+    {
+        y = type_convert<ck_tile::fp16_t>(x);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::bf8_t, ck_tile::fp16_t>(ck_tile::bf8_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = ck_tile::type_convert<ck_tile::bf8_t>(x);
+    }
+};
+
+#if 0
+struct UnaryConvert
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        y = type_convert<Y>(x);
+    }
+};
+
+struct ConvertBF16RTN
+{
+    // convert to bf16 using round to nearest (rtn)
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        // check Y datatype
+        static_assert(std::is_same_v<Y, ck_tile::bf16_t>, "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
+                      "Data type is not supported by this operation!");
+
+        y = bf16_convert_rtn<Y>(x);
+    }
+};
+
+struct ConvertF8SR
+{
+    // convert to fp8 using stochastic rounding (SR)
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        // check Y datatype
+        static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
+                      "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
+                      "Data type is not supported by this operation!");
+
+        y = f8_convert_sr<Y>(x);
+    }
+};
+
+struct ConvertF8RNE
+{
+    // convert to fp8 using rounding to nearest even
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        // check Y datatype
+        static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
+                      "Data type is not supported by this operation!");
+
+        // check X datatype
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
+                      "Data type is not supported by this operation!");
+
+        y = f8_convert_rne<Y>(x);
+    }
+};
+#endif
+
+struct Scale
+{
+    CK_TILE_HOST_DEVICE Scale(float scale = 1.f) : scale_(scale) {}
+
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        y = ck_tile::type_convert<Y>(ck_tile::type_convert<float>(x) * scale_);
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = ck_tile::type_convert<ck_tile::fp16_t>(scale_) * x;
+    };
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::bf16_t, ck_tile::bf16_t>(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const
+    {
+        const float x_tmp = ck_tile::type_convert<float>(x);
+        const float y_tmp = scale_ * x_tmp;
+        y                 = ck_tile::type_convert<ck_tile::bf16_t>(y_tmp);
+    };
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        y = scale_ * x;
+    };
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<double, double>(double& y, const double& x) const
+    {
+        y = scale_ * x;
+    };
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
+    {
+        y = ck_tile::type_convert<int8_t>(scale_ * ck_tile::type_convert<float>(x));
+    };
+
+    float scale_;
+};
+
+struct ScaleAndResetNaNToMinusInfinity
+{
+    CK_TILE_HOST_DEVICE ScaleAndResetNaNToMinusInfinity(float scale) : scale_(scale) {}
+
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        y = ck_tile::isnan(x) ? -numeric<float>::infinity() : scale_ * x;
+    };
+
+    float scale_;
+};
+
+struct UnaryDivide
+{
+    CK_TILE_HOST_DEVICE UnaryDivide(const int32_t divider = 1) : divider_(divider) {}
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = x / type_convert<T>(divider_);
+    };
+
+    int32_t divider_ = 1;
+};
+
+struct UnarySquare
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, ck_tile::fp16_t> ||
+                          std::is_same_v<T, double> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>
+#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+                          || std::is_same_v<T, int4_t>
+#endif
+                      ,
+                      "Data type is not supported by this operation!");
+        y = x * x;
+    };
+};
+
+struct UnaryAbs
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::abs(x);
+    };
+};
+
+struct UnarySqrt
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::sqrt(x);
+    };
+};
+
+struct Relu
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        y = x > 0 ? x : 0;
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const
+    {
+        float x_f32 = ck_tile::type_convert<float>(x);
+        float y_f32 = x_f32 > 0 ? x_f32 : 0;
+        y           = ck_tile::type_convert<ck_tile::bf16_t>(y_f32);
+    }
+};
+
+// Fast GeLU
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+// host code use higher accuracy "exp" and "div"
+// gpu code use lower accuracy "_ocml_exp_f32" and "rcp" function
+struct FastGelu
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    CK_TILE_DEVICE void operator()(Y& y, const X& x) const;
+
+    template <>
+    CK_TILE_HOST void operator()<float, float>(float& y, const float& x) const
+    {
+        // const float u   = -2.f * x * (0.035677f * x * x + 0.797885f);
+        const float c1  = -2.0 * 0.035677f;
+        const float c2  = -2.0 * 0.797885f;
+        const float u   = x * (c1 * x * x + c2);
+        const float emu = exp(u);
+        y               = x / (1.f + emu);
+    }
+
+    // device code, use lower precision "__ocml_exp_f32" and "rcp"
+    template <>
+    CK_TILE_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        // const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float c1  = -2.0 * 0.035677f;
+        const float c2  = -2.0 * 0.797885f;
+        const float u   = x * (c1 * x * x + c2);
+        const float emu = __ocml_exp_f32(u);
+
+        y = x * ck_tile::rcp(1.f + emu);
+    }
+
+    template <>
+    CK_TILE_HOST void operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y,
+                                                                   const ck_tile::fp16_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<ck_tile::fp16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_DEVICE void operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y,
+                                                                     const ck_tile::fp16_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<ck_tile::fp16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_HOST void operator()<ck_tile::fp16_t, float>(ck_tile::fp16_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, x);
+
+        y = type_convert<ck_tile::fp16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_DEVICE void operator()<ck_tile::fp16_t, float>(ck_tile::fp16_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, x);
+
+        y = type_convert<ck_tile::fp16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_HOST void operator()<ck_tile::bf16_t, float>(ck_tile::bf16_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, x);
+
+        y = type_convert<ck_tile::bf16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_DEVICE void operator()<ck_tile::bf16_t, float>(ck_tile::bf16_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, x);
+
+        y = type_convert<ck_tile::bf16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_DEVICE void operator()<ck_tile::bf16_t, ck_tile::bf16_t>(ck_tile::bf16_t& y,
+                                                                     const ck_tile::bf16_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<ck_tile::bf16_t>(y_f);
+    }
+
+    template <>
+    CK_TILE_HOST void operator()<ck_tile::bf16_t, ck_tile::bf16_t>(ck_tile::bf16_t& y,
+                                                                   const ck_tile::bf16_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<ck_tile::bf16_t>(y_f);
+    }
+};
+
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+erf(x/sqrt(2)))
+struct Gelu
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        y = 0.5f * x * (1.f + erf(float(0.70710678118f * x)));
+    }
+
+    template <>
+    CK_TILE_HOST_DEVICE void
+    operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const
+    {
+        y = ck_tile::fp16_t(0.5) * x *
+            (ck_tile::fp16_t(1) + ck_tile::fp16_t(erf(float(0.70710678118f * x))));
+    }
+};
+
+struct Sigmoid
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+        constexpr T one = type_convert<T>(1);
+        y               = one / (one + ck_tile::exp(-x));
+    };
+};
+
+struct Silu
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+        constexpr T one = type_convert<T>(1);
+        y               = x * (one / (one + ck_tile::exp(-x)));
+    };
+};
+
+struct TanH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::tanh(x);
+    };
+};
+
+struct ACos
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::acos(x);
+    };
+};
+
+struct Neg
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::neg(x);
+    };
+};
+
+struct ATan
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::atan(x);
+    };
+};
+
+struct Sin
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::sin(x);
+    };
+};
+
+struct ASinH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::asinh(x);
+    };
+};
+
+struct Cos
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::cos(x);
+    };
+};
+
+struct ACosH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::acosh(x);
+    };
+};
+
+struct Tan
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::tan(x);
+    };
+};
+
+struct ATanH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::atanh(x);
+    };
+};
+
+struct SinH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::sinh(x);
+    };
+};
+
+struct Ceil
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::ceil(x);
+    };
+};
+
+struct Exp
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::exp(x);
+    };
+};
+
+struct CosH
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::cosh(x);
+    };
+};
+
+struct Floor
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::floor(x);
+    };
+};
+
+struct Log
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::log(x);
+    };
+};
+
+struct ASin
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::asin(x);
+    };
+};
+
+struct Rcp
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int8_t> ||
+                          std::is_same_v<T, int32_t>,
+                      "Data type is not supported by this operation!");
+
+        y = ck_tile::rcp(x);
+    };
+};
+
+struct Swish
+{
+    Swish(float beta = 1.0f) : beta_(beta) {}
+
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
+    {
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, double> ||
+                          std::is_same_v<X, ck_tile::fp16_t>,
+                      "Data type is not supported by this operation!");
+
+        static_assert(std::is_same_v<Y, float> || std::is_same_v<Y, double> ||
+                          std::is_same_v<Y, ck_tile::fp16_t>,
+                      "Data type is not supported by this operation!");
+
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<Y>(x / (1.f + ck_tile::exp(bx)));
+    };
+
+    const float beta_;
+};
+
+struct SoftRelu
+{
+    SoftRelu(float alpha = 1.f) : alpha_(alpha){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha  = type_convert<T>(alpha_);
+        constexpr T one = type_convert<T>(1);
+        y               = ck_tile::log(one + ck_tile::exp(x * casted_alpha)) / casted_alpha;
+    }
+    const float alpha_;
+};
+
+struct Power
+{
+    Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
+        : alpha_(alpha), beta_(beta), gamma_(gamma){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha     = type_convert<T>(alpha_);
+        T casted_beta      = type_convert<T>(beta_);
+        T casted_gamma     = type_convert<T>(gamma_);
+        T shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                  = ck_tile::pow(shifted_scaled_x, casted_gamma);
+    }
+    const float alpha_;
+    const float beta_;
+    const float gamma_;
+};
+
+struct ClippedRelu
+{
+    ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        T casted_beta  = type_convert<T>(beta_);
+        y              = ck_tile::min(casted_beta, ck_tile::max(casted_alpha, x));
+    }
+    const float alpha_;
+    const float beta_;
+};
+
+struct LeakyRelu
+{
+    LeakyRelu(float alpha = 0.01f) : alpha_(alpha){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        y              = x >= 0 ? x : x * casted_alpha;
+    }
+    const float alpha_;
+};
+
+struct Elu
+{
+    Elu(float alpha = 1.f) : alpha_(alpha){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        y              = x > 0 ? x : casted_alpha * ck_tile::expm1(x);
+    }
+    const float alpha_;
+};
+
+struct Logistic
+{
+    Logistic(float alpha = 1.f) : alpha_(alpha){};
+
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                          std::is_same_v<T, ck_tile::fp16_t> || std::is_same_v<T, int32_t> ||
+                          std::is_same_v<T, int8_t>,
+                      "Data type is not supported by this operation!");
+        T casted_alpha  = type_convert<T>(alpha_);
+        constexpr T one = type_convert<T>(1);
+        y               = casted_alpha / (one + ck_tile::exp(-x) * casted_alpha);
+    }
+    const float alpha_;
+};
+
+struct ConvInvscale
+{
+    CK_TILE_HOST_DEVICE
+    ConvInvscale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f)
+        : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out)
+    {
+    }
+
+    template <typename E, typename C>
+    CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp8_t, float>(ck_tile::fp8_t& e,
+                                                               const float& c) const
+    {
+        e = type_convert<ck_tile::fp8_t>(c / scale_in_ / scale_wei_ / scale_out_);
+    };
+
+    float scale_in_;
+    float scale_wei_;
+    float scale_out_;
+};
+
+struct ConvScale
+{
+    CK_TILE_HOST_DEVICE
+    ConvScale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f)
+        : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out)
+    {
+    }
+
+    template <typename E, typename C>
+    CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp8_t, float>(ck_tile::fp8_t& e,
+                                                               const float& c) const
+    {
+        e = type_convert<ck_tile::fp8_t>(c * scale_in_ * scale_wei_ * scale_out_);
+    };
+
+    float scale_in_;
+    float scale_wei_;
+    float scale_out_;
+};
+
+struct ConvScaleRelu
+{
+    CK_TILE_HOST_DEVICE
+    ConvScaleRelu(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f)
+        : scale_in_(scale_in), scale_wei_(scale_wei), scale_out_(scale_out)
+    {
+    }
+
+    template <typename E, typename C>
+    CK_TILE_HOST_DEVICE void operator()(E& e, const C& c) const;
+
+    template <>
+    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp8_t, float>(ck_tile::fp8_t& e,
+                                                               const float& c) const
+    {
+        float x;
+        Relu{}.template operator()<float>(x, c * scale_in_ * scale_wei_);
+        e = type_convert<ck_tile::fp8_t>(x * scale_out_);
+    };
+
+    float scale_in_;
+    float scale_wei_;
+    float scale_out_;
+};
+
+template <typename DstType, typename SrcType>
+struct Cast
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
+    {
+        y = ck_tile::type_convert<DstType>(x);
+    };
+};
+
+// support fastconvert of int8 to fp16
+#if 0
+template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
+struct FastNumericArrayConverter
+{
+};
+
+template <>
+struct FastNumericArrayConverter<uint8_t, ck_tile::fp16_t, 4>
+{
+    using InputArray  = vector_type<uint8_t, 4>;
+    using OutputArray = vector_type<ck_tile::fp16_t, 4>;
+
+    CK_TILE_DEVICE static OutputArray convert(InputArray const& Input)
+    {
+        OutputArray Output;
+
+        uint32_t* half_2       = reinterpret_cast<uint32_t*>(&Output);
+        uint32_t const uint8_4 = reinterpret_cast<uint32_t const&>(Input);
+
+        static constexpr uint32_t byte_selector_01 = 0x05010500;
+        static constexpr uint32_t byte_selector_23 = 0x05030502;
+        static constexpr uint32_t fp16_adder       = 0x64646464;
+        half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01);
+        half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23);
+
+        static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+        asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]"
+                     : "=v"(half_2[0])
+                     : "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM));
+        asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]"
+                     : "=v"(half_2[1])
+                     : "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM));
+
+        return Output;
+    }
+
+    CK_TILE_DEVICE OutputArray operator()(InputArray const& Input) { return convert(Input); }
+};
+
+template <index_t N>
+struct FastNumericArrayConverter<uint8_t, ck_tile::fp16_t, N>
+{
+    static constexpr int VEC_WIDTH = 4;
+    static_assert(!(N % VEC_WIDTH), "N must be multiple of 4.");
+
+    using InputArray  = vector_type<uint8_t, N>;
+    using OutputArray = vector_type<ck_tile::fp16_t, N>;
+
+    CK_TILE_DEVICE static OutputArray convert(InputArray const& Input)
+    {
+        FastNumericArrayConverter<uint8_t, ck_tile::fp16_t, 4> converter;
+
+        OutputArray Output;
+
+        using Vec_InputArray  = vector_type<uint8_t, 4>;
+        using Vec_OutputArray = vector_type<ck_tile::fp16_t, 4>;
+
+        Vec_OutputArray* half_4_ptr       = reinterpret_cast<Vec_OutputArray*>(&Output);
+        Vec_InputArray const* uint8_4_ptr = reinterpret_cast<Vec_InputArray const*>(&Input);
+
+        static_for<0, N / VEC_WIDTH, 1>{}(
+            [&](auto i) { half_4_ptr[i] = converter(uint8_4_ptr[i]); });
+
+        return Output;
+    }
+
+    CK_TILE_DEVICE OutputArray operator()(InputArray const& Input) { return convert(Input); }
+};
+#endif
+} // namespace element_wise
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index c4872def1..05d3dae1c 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -334,7 +334,7 @@ struct BlockFmhaPipelineQRKSVSAsync
         move_tile_window(k_dram_window, {0, kK0});
         __builtin_amdgcn_sched_barrier(0);
 
-        buffer_load_fence(k_dram_window.get_num_access(), q.get_thread_buffer());
+        buffer_load_fence(k_dram_window.get_num_of_access(), q.get_thread_buffer());
         (void)q_element_func; // ??? rocm-6.x if use q element func will have scratch on hdim=64/32
         // auto q_tile = q;      // tile_elementwise_in(q_element_func, q);
 
@@ -359,7 +359,7 @@ struct BlockFmhaPipelineQRKSVSAsync
                     if constexpr(i_k0 < k0_loops - 1)
                         move_tile_window(k_dram_window, {0, kK0});
 
-                    async_load_fence(k_dram_window.get_num_access());
+                    async_load_fence(k_dram_window.get_num_of_access());
                     __builtin_amdgcn_s_barrier();
                     __builtin_amdgcn_sched_barrier(0);
                     gemm_0(s_acc,
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index a01265ad5..51d55235e 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -4,9 +4,14 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include <tuple>
 
 namespace ck_tile {
 
+/*
+ * TODO: block_tile_reduce_sync() currently has a limitation
+ * Y dim must have at least one dim not been reduced
+ */
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
 template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
@@ -104,6 +109,65 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
     });
 }
 
+/*
+ * this version is faster, using xor to do reduce, no need broadcast anymore
+ * TODO: the limitation is to-be-reduced P dim can only mapping to one R dim?
+ */
+template <typename AccDistributedTensor_, typename ReduceFunc>
+CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
+                                               const ReduceFunc& reduce_func)
+{
+    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
+    using DstrEncode       = typename Dstr::DstrEncode;
+    using DstrEncodeDetail = typename DstrEncode::detail;
+
+    constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
+    constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
+
+    constexpr index_t idim_p_lane = NDimP - 1;
+
+    constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
+
+    // loop over thread data
+    static_for<0, thread_buf_size, 1>{}([&](auto i) {
+        auto v_local = acc_tensor.get_thread_buffer()[i];
+
+        // cross-lane reduce for replication
+        // only reduce on R dimension correspond to lane
+        // (lane id maps to this R dimension)
+        static_for<0, NDimR, 1>{}([&](auto idim_r) {
+            // FIXME: nasty to use does_p_own_r_
+            if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
+            {
+                constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
+
+                constexpr index_t lid_over_rid_derivative =
+                    DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
+
+                static_assert(is_power_of_two_integer(r_length),
+                              "wrong! only support power of 2 reduction");
+
+                constexpr index_t nstage = integer_log2_floor(r_length);
+
+                // reduction sweep forward
+                static_for<0, nstage, 1>{}([&](auto istage) {
+                    // xor
+                    index_t src_lane =
+                        __lane_id() ^ (number<lid_over_rid_derivative << istage.value>{}.value);
+
+                    // pull data from remote lane
+                    const auto v_remote = warp_shuffle(v_local, src_lane);
+
+                    // reduce
+                    v_local = reduce_func(v_local, v_remote);
+                });
+            }
+        });
+
+        acc_tensor.get_thread_buffer()(i) = v_local;
+    });
+}
+
 // FIXME: this is for 2D to 1D reduce only, need to support n-D
 template <typename AccDistributedTensor_,
           typename InDistributedTensor_,
@@ -175,6 +239,10 @@ CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
 #endif
 }
 
+/*
+ * TODO: block_tile_reduce() currently has a limitation
+ * Y dim must have at least one dim not been reduced
+ */
 template <typename AccDataType_,
           typename InDistributedTensor_,
           index_t... InReduceDims,
@@ -208,4 +276,106 @@ CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
     return acc_tensor;
 }
 
+// this version only support 2D->1D reduce (reduce-dim=seq<0, 1>)
+// this version only support in/acc/out datatypes are the same
+// this version will call thread/warp+sync in one function call
+//
+template <typename InDistributedTensor_>
+struct BlockReduce2D
+{
+    using InDistributedTensor = remove_cvref_t<InDistributedTensor_>;
+    using InDataType          = typename InDistributedTensor::DataType;
+
+    CK_TILE_HOST_DEVICE BlockReduce2D(const InDistributedTensor& t_, const InDataType& reduce_init_)
+        : t(t_), reduce_init(reduce_init_)
+    {
+    }
+
+    CK_TILE_HOST_DEVICE constexpr auto MakeDstBlockTile() const
+    {
+        using ReduceDim = sequence<1>; // hard coded
+        constexpr auto acc_dstr =
+            make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding(
+                InDistributedTensor::get_tile_distribution()
+                    .get_static_tile_distribution_encoding(),
+                ReduceDim{}));
+
+        return make_static_distributed_tensor<InDataType>(acc_dstr);
+    }
+
+    // return number of pixels each lane need to reduce
+    CK_TILE_HOST_DEVICE constexpr auto get_reduce_length_y() const
+    {
+        constexpr auto spans = InDistributedTensor::get_distributed_spans();
+    }
+
+    // Here ReducePacksPerXDim is not the same meaning as that in static_uford/sweep_tile_uspan
+    // this is number of packs along the X-dim. We need to compute the Unpacks along the Y dim
+    // internally
+    // For simplicity, we just support along the row dimension, ReducePacksPerXDim is always 2
+    // element , and the first element is always ignored For simplicity, will always try from
+    // right-to-left to find alone which Y dim to split
+    template <typename ReduceFunc,
+              typename ReduceSyncFunc,
+              typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
+    CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func,
+                                        const ReduceSyncFunc& reduce_sync_func,
+                                        ReducePacksPerXDim = {}) const
+    {
+        constexpr auto spans = InDistributedTensor::get_distributed_spans();
+
+        constexpr auto row_y_unpacks = [&]() {
+            constexpr auto row_y_lengths = typename decltype(spans[number<1>{}])::Impl{};
+            constexpr auto row_y_size =
+                reduce_on_sequence(row_y_lengths, multiplies{}, number<1>{});
+            constexpr auto row_y_packs = ReducePacksPerXDim{}.at(number<1>{});
+
+            static_assert(row_y_size % row_y_packs == 0);
+
+            constexpr auto row_y_slice_size = row_y_size / row_y_packs;
+
+            constexpr auto slice_info = slice_sequence(row_y_lengths, number<row_y_slice_size>{});
+            constexpr auto unpacks    = slice_info[number<1>{}];
+            return unpacks;
+        }();
+
+        auto acc_tensor = MakeDstBlockTile();
+
+        // in-thread reduction
+        // FIXME: hard coded to be 2D to 1D reduction
+        sweep_tile_span(spans[number<0>{}], [&](auto dstr_idx_i0) {
+            constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);
+
+            auto acc = acc_tensor[acc_dstr_idx];
+
+            sweep_tile_uspan(
+                spans[number<1>{}],
+                [&](auto... dstr_idx_i1) {
+                    acc = reduce_func(acc, t[make_tuple(dstr_idx_i0, dstr_idx_i1)]...);
+                },
+                row_y_unpacks);
+
+            acc_tensor(acc_dstr_idx) = acc;
+        });
+
+        // TODO: always use xor to do cross-lane reduce
+        block_tile_reduce_xor_sync(acc_tensor, reduce_sync_func);
+
+        return acc_tensor;
+    }
+
+    template <typename ReduceFunc>
+    CK_TILE_HOST_DEVICE auto operator()(const ReduceFunc& reduce_func) const
+    {
+        return operator()(reduce_func, reduce_func);
+    }
+
+    InDistributedTensor t;
+    InDataType reduce_init;
+};
+
+// deduction guide
+template <typename T>
+CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D(const T&, const typename T::DataType&)->BlockReduce2D<T>;
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp
new file mode 100644
index 000000000..584ca7068
--- /dev/null
+++ b/include/ck_tile/ops/softmax.hpp
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/softmax/block/block_softmax_2d.hpp"
+#include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp b/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp
new file mode 100644
index 000000000..607ec7eb5
--- /dev/null
+++ b/include/ck_tile/ops/softmax/block/block_softmax_2d.hpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+
+#define _BLOCK_SOFTMAX_USE_UNPACK2 0
+
+namespace ck_tile {
+
+/*
+simple 2d softmax implementation, along row (dim=1)
+requirement:
+    1). each row is within a warp
+    2). data type must be a dword
+*/
+template <typename Problem_, typename Policy_ = void>
+struct BlockSoftmax2D
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using DataType = typename Problem::DataType;
+
+    template <typename DistributedTensor, index_t dim = 1>
+    CK_TILE_DEVICE void
+    operator()(const DistributedTensor& x, DistributedTensor& y, number<dim> = {})
+    {
+        const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+        const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
+#if _BLOCK_SOFTMAX_USE_UNPACK2
+        const auto f_max3 = [](auto e0, auto e1, auto e2) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, %2, %3" : "=v"(rtn) : "v"(e0), "v"(e1), "v"(e2));
+            return rtn;
+        };
+        const auto f_sum3 = [](auto e0, auto e1, auto e2) { return e0 + e1 + e2; };
+#endif
+
+        // compute row max
+        auto reduce_row_max = BlockReduce2D{x, -numeric<DataType>::infinity()};
+#if _BLOCK_SOFTMAX_USE_UNPACK2
+        auto row_max = reduce_row_max(f_max3, f_max, sequence<1, 2>{});
+#else
+        auto row_max = reduce_row_max(f_max);
+#endif
+        sweep_tile<DistributedTensor>([&](auto idx) {
+            constexpr auto row_id = make_tuple(idx[number<0>{}]);
+            y(idx)                = exp(x[idx] - row_max[row_id]);
+        });
+
+        // compute row sum
+        auto reduce_row_sum = BlockReduce2D<decltype(y)>{y, DataType{0}};
+#if _BLOCK_SOFTMAX_USE_UNPACK2
+        auto row_sum = reduce_row_sum(f_sum3, f_sum, sequence<1, 2>{});
+#else
+        auto row_sum = reduce_row_sum(f_sum);
+#endif
+        // reciprocal
+        auto r = make_static_distributed_tensor<DataType>(row_sum.get_tile_distribution());
+        sweep_tile(row_sum, [&](auto idx) { r(idx) = DataType{1} / row_sum(idx); });
+
+        // scale
+        sweep_tile<DistributedTensor>([&](auto idx) {
+            constexpr auto row_id = make_tuple(idx[number<0>{}]);
+            y(idx)                = y(idx) * r(row_id);
+        });
+    }
+
+    template <typename DistributedTensor, index_t dim = 1>
+    CK_TILE_DEVICE decltype(auto) operator()(const DistributedTensor& x, number<dim> = {})
+    {
+        auto y = DistributedTensor{}; // distributed tensor
+        operator()(x, y, number<dim>{});
+        return y;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp b/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp
new file mode 100644
index 000000000..82b9a5a48
--- /dev/null
+++ b/include/ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename DataType_>
+struct BlockSoftmax2DProblem
+{
+    using DataType = remove_cvref_t<DataType_>;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp
new file mode 100644
index 000000000..b1143e4a0
--- /dev/null
+++ b/include/ck_tile/ops/topk.hpp
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp"
+#include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp b/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp
new file mode 100644
index 000000000..164685f98
--- /dev/null
+++ b/include/ck_tile/ops/topk/block/block_topk_stream_2d.hpp
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+/*
+simple 2d topk implementation, along row (dim=1)
+requirement:
+    1). each row is within a warp
+*/
+template <typename Problem_, typename Policy_ = void>
+struct BlockTopkStream2D
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using DataType  = typename Problem::DataType;
+    using IndexType = typename Problem::IndexType;
+
+    // TODO: if DataType is subdword, need pack into single dword to use argmax
+    struct ArgmaxPacket
+    {
+        DataType arg;
+        index_t value;
+    };
+
+    template <typename DistributedTensor, typename OutWindow, typename IdxWindow, index_t dim = 1>
+    CK_TILE_DEVICE void operator()(const DistributedTensor& x,
+                                   const OutWindow& out_window,
+                                   const IdxWindow& idx_window,
+                                   index_t k,
+                                   number<dim> = {})
+    {
+        OutWindow out_window_tmp = out_window;
+        IdxWindow idx_window_tmp = idx_window;
+        static_assert(
+            std::is_same_v<typename DistributedTensor::DataType, typename OutWindow::DataType> &&
+            std::is_same_v<typename DistributedTensor::DataType, DataType>);
+        static_assert(std::is_same_v<typename IdxWindow::DataType, IndexType>);
+
+        DistributedTensor x_tmp = x;
+        constexpr auto dst_dist = typename IdxWindow::TileDstr{};
+
+        // argmax for topk
+        const auto f_argmax = [](ArgmaxPacket e0, ArgmaxPacket e1) {
+            return e0.arg > e1.arg ? e0 : e1;
+        };
+
+        for(index_t i_k = 0; i_k < k; i_k++)
+        {
+            constexpr auto span_2d = DistributedTensor::get_distributed_spans();
+            auto packet            = [&]() {
+                auto tmp = make_static_distributed_tensor<ArgmaxPacket>(x.get_tile_distribution());
+
+                sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
+                    sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
+                        const auto tile_idx = get_x_indices_from_distributed_indices(
+                            tmp.get_tile_distribution(), make_tuple(idx0, idx1));
+                        constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                        ArgmaxPacket t;
+                        t.arg        = x_tmp(i_j_idx); // !!! we reference x here
+                        t.value      = tile_idx.at(number<1>{});
+                        tmp(i_j_idx) = t;
+                    });
+                });
+                return tmp;
+            }();
+
+            auto argmax_init = ArgmaxPacket{-numeric<DataType>::infinity(), 0};
+            auto r = block_tile_reduce<ArgmaxPacket>(packet, sequence<1>{}, f_argmax, argmax_init);
+            block_tile_reduce_xor_sync(r, f_argmax);
+
+            auto o = make_static_distributed_tensor<DataType>(dst_dist);
+            auto i = make_static_distributed_tensor<IndexType>(dst_dist);
+            sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                    ArgmaxPacket tmp       = r(i_j_idx);
+                    o(i_j_idx)             = tmp.arg;
+                    i(i_j_idx)             = tmp.value;
+                });
+            });
+
+            // update value
+            sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
+                    const auto tile_idx = get_x_indices_from_distributed_indices(
+                        x.get_tile_distribution(), make_tuple(idx0, idx1));
+                    auto col_id = tile_idx.at(number<1>{});
+
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+
+                    x_tmp(i_j_idx) = (col_id == r(i_j_idx).value) ? -numeric<DataType>::infinity()
+                                                                  : x_tmp(i_j_idx);
+                });
+            });
+
+            if(threadIdx.x % Problem::ColLanes == 0)
+            {
+                store_tile(out_window_tmp, o);
+                store_tile(idx_window_tmp, i);
+            }
+            move_tile_window(out_window_tmp, {number<0>{}, number<1>{}});
+            move_tile_window(idx_window_tmp, {number<0>{}, number<1>{}});
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp b/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp
new file mode 100644
index 000000000..d47188d86
--- /dev/null
+++ b/include/ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+/*
+simple 2d topk implementation, along row (dim=1)
+requirement:
+    1). each row is within a warp
+*/
+template <typename DataType_, typename IndexType_, index_t ColLanes_>
+struct BlockTopkStream2DProblem
+{
+    using DataType                    = remove_cvref_t<DataType_>;
+    using IndexType                   = remove_cvref_t<IndexType_>;
+    static constexpr index_t ColLanes = ColLanes_;
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp
new file mode 100644
index 000000000..809473d53
--- /dev/null
+++ b/include/ck_tile/ops/topk_softmax.hpp
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp"
+#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp"
+#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
+#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
new file mode 100644
index 000000000..b8520ae61
--- /dev/null
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+struct TopkSoftmaxHostArgs
+{
+    const void* p_input;
+    void* p_output;
+    void* p_indices;
+    index_t num_rows;
+    index_t num_experts;
+    index_t topk;
+    index_t stride_input;  // row stride for input, at least experts
+    index_t stride_output; // row stride for output/indices, at least tpok
+};
+
+template <typename Pipeline_>
+struct TopkSoftmaxKernel
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = remove_cvref_t<typename Pipeline::Problem>;
+
+    using InputType  = typename Problem::InputType;
+    using WeightType = typename Problem::WeightType;
+    using IndexType  = typename Problem::IndexType;
+
+    struct TopkSoftmaxKargs
+    {
+        const void* p_input;
+        void* p_output;
+        void* p_indices;
+        index_t num_rows;
+        index_t num_experts;
+        index_t topk;
+        index_t stride_input;  // row stride for input, at least experts
+        index_t stride_output; // row stride for output/indices, at least tpok
+    };
+
+    using Kargs = TopkSoftmaxKargs;
+    using Hargs = TopkSoftmaxHostArgs;
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
+    {
+        if constexpr(Problem::LaunchType > 0)
+        {
+            int num_cu = [&]() {
+                hipDeviceProp_t dev_prop;
+                hipDevice_t dev;
+                HIP_CHECK_ERROR(hipGetDevice(&dev));
+                HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
+                return dev_prop.multiProcessorCount;
+            }();
+            return dim3(num_cu * Problem::LaunchType);
+        }
+        else
+        {
+            const int num_warps = (h.num_rows + Problem::RowsPerWarp - 1) / Problem::RowsPerWarp;
+            const int num_blocks =
+                (num_warps + Problem::WarpsPerBlock - 1) / Problem::WarpsPerBlock;
+            return dim3(num_blocks);
+        }
+    }
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_input       = h.p_input;
+        k.p_output      = h.p_output;
+        k.p_indices     = h.p_indices;
+        k.num_rows      = h.num_rows;
+        k.num_experts   = h.num_experts;
+        k.topk          = h.topk;
+        k.stride_input  = h.stride_input;
+        k.stride_output = h.stride_output;
+        return k;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        index_t block_row_id = static_cast<index_t>(blockIdx.x * Problem::RowsPerBlock);
+
+        if(block_row_id > kargs.num_rows)
+            return;
+
+        index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
+        index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
+        index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
+
+        const auto input_window = [&]() {
+            const InputType* p_input =
+                reinterpret_cast<const InputType*>(kargs.p_input) + block_os_inp;
+
+            auto tmp = make_naive_tensor_view<address_space_enum::global>(
+                p_input,
+                make_tuple(num_rows_rem, kargs.num_experts),
+                make_tuple(kargs.stride_input, 1),
+                number<Problem::VectorSize>{},
+                number<1>{});
+
+            auto view = pad_tensor_view(
+                tmp,
+                make_tuple(number<Problem::RowsPerBlock>{}, number<Problem::Experts>{}),
+                sequence<0, 1>{}); // out-most dim no need pad(leverage oob)
+
+            return make_tile_window(
+                view,
+                make_tuple(number<Problem::RowsPerBlock>{}, number<Problem::Experts>{}),
+                {0, 0});
+        }();
+
+        auto output_window = [&]() {
+            WeightType* p_output = reinterpret_cast<WeightType*>(kargs.p_output) + block_os_out;
+            auto tmp             = make_naive_tensor_view<address_space_enum::global>(
+                p_output,
+                make_tuple(num_rows_rem, kargs.topk),
+                make_tuple(kargs.stride_output, 1),
+                number<Problem::VectorSize>{},
+                number<1>{});
+            auto view =
+                pad_tensor_view(tmp,
+                                make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}),
+                                sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob)
+                                                   // 2. we loop over topk 1-1, no need padding
+            return make_tile_window(
+                view, make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}), {0, 0});
+        }();
+
+        auto indices_window = [&]() {
+            IndexType* p_indices = reinterpret_cast<IndexType*>(kargs.p_indices) + block_os_out;
+            auto tmp             = make_naive_tensor_view<address_space_enum::global>(
+                p_indices,
+                make_tuple(num_rows_rem, kargs.topk),
+                make_tuple(kargs.stride_output, 1),
+                number<Problem::VectorSize>{},
+                number<1>{});
+            auto view =
+                pad_tensor_view(tmp,
+                                make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}),
+                                sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob)
+                                                   // 2. we loop over topk 1-1, no need padding
+            return make_tile_window(
+                view, make_tuple(number<Problem::RowsPerBlock>{}, number<1>{}), {0, 0});
+        }();
+
+        Pipeline{}(input_window,
+                   output_window,
+                   indices_window,
+                   kargs.num_rows,
+                   kargs.num_experts,
+                   kargs.topk,
+                   block_row_id);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
new file mode 100644
index 000000000..d620d9bec
--- /dev/null
+++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
+#include <string>
+#include <type_traits>
+
+#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
+#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
+#endif
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = TopkSoftmaxWarpPerRowPolicy>
+struct TopkSoftmaxWarpPerRowPipeline
+{
+    // TODO: this kernel only support warp per row
+    using Problem    = remove_cvref_t<Problem_>;
+    using Policy     = remove_cvref_t<Policy_>;
+    using WeightType = typename Problem::WeightType;
+
+    template <typename InputWindow, typename OutputWindow, typename IndexWindow>
+    CK_TILE_DEVICE auto operator()(const InputWindow& input_window,
+                                   OutputWindow& out_window,
+                                   IndexWindow& idx_window,
+                                   index_t rows,
+                                   index_t experts,
+                                   index_t k,
+                                   index_t block_row_id)
+    {
+#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
+        auto inp_win = make_tile_window_linear_raw(
+            input_window, Policy::template MakeInputDistribution<Problem>(), sequence<0, 1>{});
+#else
+        auto inp_win = make_tile_window_linear(
+            input_window, Policy::template MakeInputDistribution<Problem>(), sequence<0, 1>{});
+#endif
+        auto out_win = make_tile_window_linear(out_window.get_bottom_tensor_view(),
+                                               out_window.get_window_lengths(),
+                                               out_window.get_window_origin(),
+                                               Policy::template MakeOutputDistribution<Problem>());
+        auto idx_win = make_tile_window_linear(idx_window.get_bottom_tensor_view(),
+                                               idx_window.get_window_lengths(),
+                                               idx_window.get_window_origin(),
+                                               Policy::template MakeOutputDistribution<Problem>());
+
+        auto softmax = Policy::template GetSoftmax<Problem>();
+        auto topk    = Policy::template GetTopk<Problem>();
+
+        const index_t grid_rows_per_loop = gridDim.x * Problem::RowsPerBlock;
+
+        while(1)
+        {
+#if TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
+            __builtin_amdgcn_sched_barrier(0);
+            auto x =
+                load_tile_raw(inp_win, number<-1>{}, bool_constant<true>{}, bool_constant<true>{});
+            buffer_load_fence(number<0>{});
+            __builtin_amdgcn_sched_barrier(0);
+#else
+            auto x = load_tile(inp_win);
+#endif
+            // cast and pad input data
+            auto w = [&]() {
+#if 0
+                auto w_ = cast_tile<WeightType>(x);
+
+                constexpr auto span_2d = decltype(w_)::get_distributed_spans();
+                sweep_tile_span(span_2d[number<0>{}], [&](auto idx0) {
+                    sweep_tile_span(span_2d[number<1>{}], [&](auto idx1) {
+                        constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                        const auto x_indices   = get_x_indices_from_distributed_indices(
+                            w_.get_tile_distribution(), i_j_idx);
+                        const auto current_expert = x_indices.at(number<1>{});
+                        // set to -INF if OOB so that later softmax can work properly
+                        w_(i_j_idx) = current_expert >= experts ? -numeric<WeightType>::infinity()
+                                                                : w_(i_j_idx);
+                    });
+                });
+                return w_;
+#else
+                auto w_  = make_static_distributed_tensor<WeightType>(x.get_tile_distribution());
+                auto w_f = [&](auto idx) {
+                    w_(idx) = type_convert<WeightType>(x(idx));
+                    const auto x_indices =
+                        get_x_indices_from_distributed_indices(w_.get_tile_distribution(), idx);
+                    const auto current_expert = x_indices.at(number<1>{});
+                    w_(idx) =
+                        current_expert >= experts ? -numeric<WeightType>::infinity() : w_(idx);
+                };
+                tile_sweeper ts{w_, w_f};
+                ts();
+                return w_;
+#endif
+            }();
+
+            // softmax
+            auto y = softmax(w);
+
+            topk(y, out_win, idx_win, k);
+
+            // check exit
+            if constexpr(Problem::LaunchType == 0)
+            {
+                break;
+            }
+            else
+            {
+                block_row_id += grid_rows_per_loop;
+                if(block_row_id >= rows)
+                    break;
+            }
+
+            move_tile_window(inp_win, {grid_rows_per_loop, number<0>{}});
+            move_tile_window(out_win, {grid_rows_per_loop, number<0>{}});
+            move_tile_window(idx_win, {grid_rows_per_loop, number<0>{}});
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp
new file mode 100644
index 000000000..a6e886bd3
--- /dev/null
+++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/softmax.hpp"
+#include "ck_tile/ops/topk.hpp"
+
+namespace ck_tile {
+
+struct TopkSoftmaxWarpPerRowPolicy
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
+    {
+        // TODO: Y dim must have one dim that is not reduced
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<1>,
+                tuple<sequence<Problem::IssuesPerCol,
+                               Problem::WarpsPerBlock,
+                               Problem::RowsPerWarpPerColIssue>,
+                      sequence<Problem::IssuesPerRow, Problem::LanesPerRow, Problem::VectorSize>>,
+                tuple<sequence<1>, sequence<1, 2>>,
+                tuple<sequence<1>, sequence<2, 1>>,
+                sequence<1, 2, 2>,
+                sequence<0, 0, 2>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
+    {
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<Problem::LanesPerRow>, // repeat this one
+                                       tuple<sequence<Problem::IssuesPerCol,
+                                                      Problem::WarpsPerBlock,
+                                                      Problem::RowsPerWarpPerColIssue>,
+                                             sequence<1>>, // each row write out single element
+                                       tuple<sequence<1>, sequence<1, 0>>,
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSoftmax()
+    {
+        using softmax_problem = BlockSoftmax2DProblem<typename Problem::WeightType>;
+        return BlockSoftmax2D<softmax_problem>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetTopk()
+    {
+        using topk_problem = BlockTopkStream2DProblem<typename Problem::WeightType,
+                                                      typename Problem::IndexType,
+                                                      Problem::LanesPerRow>;
+        // Note: replicate is LanesPerRow
+        return BlockTopkStream2D<topk_problem>{};
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp
new file mode 100644
index 000000000..917096ad5
--- /dev/null
+++ b/include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename InputType_,
+          typename WeightType_,
+          typename IndexType_,
+          index_t Experts_,
+          index_t IssuesPerCol_  = 2, // issue along col, to make sure block_reduce() OK
+          index_t BytesPerIssue_ = sizeof(InputType_),
+          index_t LaunchType_    = 0, // 0-streaming, >0, persistent #occupancy
+          index_t BlockSize_     = 256>
+struct TopkSoftmaxWarpPerRowProblem
+{
+    // TODO: this kernel only support warp per row
+    using InputType  = remove_cvref_t<InputType_>;
+    using WeightType = remove_cvref_t<WeightType_>;
+    using IndexType  = remove_cvref_t<IndexType_>;
+
+    static constexpr index_t LaunchType    = LaunchType_;
+    static constexpr index_t Experts       = Experts_;
+    static constexpr index_t BytesPerIssue = BytesPerIssue_;
+    static constexpr index_t IssuesPerCol  = IssuesPerCol_;
+    static constexpr index_t BlockSize     = BlockSize_;
+    static constexpr index_t WarpSize      = get_warp_size();
+
+    static_assert(BytesPerIssue % sizeof(InputType) == 0);
+    static constexpr index_t VectorSize = BytesPerIssue / sizeof(InputType);
+    static_assert(Experts % VectorSize == 0);
+    static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize);
+    static_assert(WarpSize % LanesPerRow == 0);
+    static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow;
+    static constexpr index_t RowsPerWarp            = IssuesPerCol * RowsPerWarpPerColIssue;
+    static constexpr index_t IssuesPerRow           = Experts / (LanesPerRow * VectorSize);
+
+    static constexpr index_t WarpsPerBlock = BlockSize / WarpSize;
+    static constexpr index_t RowsPerBlock  = RowsPerWarp * WarpsPerBlock;
+};
+} // namespace ck_tile
-- 
GitLab


From 922e42a039a42770446c42fabc62fe1e7b050625 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 28 Oct 2024 19:02:48 -0700
Subject: [PATCH 021/153] fix compilation errors for gfx12 with clang20 (#1606)

---
 include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
index 3ea19da74..fa389c340 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -352,7 +352,7 @@ struct BlockwiseGemmWMMA
                             constexpr index_t c_offset =
                                 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
 
-                            wmma_gemm.template Run(
+                            wmma_gemm.template Run<>(
                                 a_thread_vec.template AsType<wmma_input_type_a>(),
                                 b_thread_vec.template AsType<wmma_input_type_b>(),
                                 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
@@ -406,7 +406,7 @@ struct BlockwiseGemmWMMA
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
 
-                        wmma_gemm.template Run(
+                        wmma_gemm.template Run<>(
                             a_thread_vec.template AsType<wmma_input_type_a>(),
                             b_thread_vec.template AsType<wmma_input_type_b>(),
                             c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-- 
GitLab


From 9fbd72e97e34f530ae370527755b655bf390d9ee Mon Sep 17 00:00:00 2001
From: valarLip <103567126+valarLip@users.noreply.github.com>
Date: Tue, 29 Oct 2024 18:05:53 +0800
Subject: [PATCH 022/153] [CK_TILE] add generic_permute (#1607)

---
 example/ck_tile/06_permute/CMakeLists.txt     |  13 +
 example/ck_tile/06_permute/README.md          |  46 ++
 .../alternative_impl/matrix_core_swizzle.cpp  |  98 +++++
 .../alternative_impl/matrix_core_swizzle.hpp  |  20 +
 .../matrix_core_swizzle_kernel.hpp            | 413 ++++++++++++++++++
 example/ck_tile/06_permute/permute.cpp        | 411 +++++++++++++++++
 example/ck_tile/06_permute/permute.hpp        |  19 +
 .../ck_tile/06_permute/script/smoke_test.sh   |  34 ++
 example/ck_tile/CMakeLists.txt                |   1 +
 include/ck_tile/host.hpp                      |   1 +
 .../host/reference/reference_permute.hpp      |  57 +++
 include/ck_tile/ops/permute.hpp               |   8 +
 .../permute/kernel/generic_permute_kernel.hpp | 169 +++++++
 .../pipeline/generic_petmute_problem.hpp      |  28 ++
 14 files changed, 1318 insertions(+)
 create mode 100644 example/ck_tile/06_permute/CMakeLists.txt
 create mode 100644 example/ck_tile/06_permute/README.md
 create mode 100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
 create mode 100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
 create mode 100644 example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
 create mode 100644 example/ck_tile/06_permute/permute.cpp
 create mode 100644 example/ck_tile/06_permute/permute.hpp
 create mode 100644 example/ck_tile/06_permute/script/smoke_test.sh
 create mode 100644 include/ck_tile/host/reference/reference_permute.hpp
 create mode 100644 include/ck_tile/ops/permute.hpp
 create mode 100644 include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
 create mode 100644 include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp

diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt
new file mode 100644
index 000000000..327fceb68
--- /dev/null
+++ b/example/ck_tile/06_permute/CMakeLists.txt
@@ -0,0 +1,13 @@
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
+
+if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
+# set(PERMUTE_USE_ALTERNATIVE_IMPL false)
+set(PERMUTE_USE_ALTERNATIVE_IMPL true)
+endif()
+if(PERMUTE_USE_ALTERNATIVE_IMPL)
+target_compile_options(tile_example_permute PRIVATE -DPERMUTE_USE_ALTERNATIVE_IMPL)
+target_sources(tile_example_permute PRIVATE alternative_impl/matrix_core_swizzle.cpp)
+endif()
+# target_compile_options(tile_example_permute PRIVATE -v --save-temps -Wno-gnu-line-marker)
diff --git a/example/ck_tile/06_permute/README.md b/example/ck_tile/06_permute/README.md
new file mode 100644
index 000000000..03bd810ff
--- /dev/null
+++ b/example/ck_tile/06_permute/README.md
@@ -0,0 +1,46 @@
+# permute
+
+This folder contains example for permute kernel, which is similiar to [torch.permute](https://pytorch.org/docs/stable/generated/torch.permute.html) (combined with [torch.contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html)). Currently we implement a generic permute kernel that support up to rank 8 arbitrary permutation with a single kernel instance. Performance is not the first consideration, we prefer a simple and general kernel implementation using `ck_tile` in this example.
+
+
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -prec    data type. fp16/bf16/fp32 (default:fp16)
+      -shape    the shape of the input tensor (default:2,3,4)
+       -perm    permute perm (default:2,1,0)
+```
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_permute -j
+```
+This will result in an executable `build/bin/tile_example_permute`
+
+
+## some examples
+```
+# torch
+x=torch.randn(2,3,4,6)
+y=x.permute(0,3,2,1).contiguous()
+
+# ck_tile
+./build/bin/tile_example_permute -shape=2,3,4,6 -perm=0,3,2,1
+```
+
+or you can try the smoke_test
+```
+# in the root of ck_tile, after you build this example
+sh example/ck_tile/06_permute/script/smoke_test.sh
+```
+
+### alternative implementation
+we have an alternative implementation under `alternative_impl/` folder, that can swizzle the tensor to be more friendly for data loading for matrix core layout. This can be enabled when dealing with a `rank-7` tensor, with a fixed pattern of either `0,1,4,2,5,3,6` or `0,1,2,4,5,3,6`. There are other shape limitation of this implementation, check the source code of `permute.cpp` for detail.
+```
+# example
+./build/bin/tile_example_permute -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 # b_n0_k0_n1_k1_n2_k2
+./build/bin/tile_example_permute -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 # b_n0_n1_k0_k1_n2_k2
+```
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
new file mode 100644
index 000000000..93c662a28
--- /dev/null
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
@@ -0,0 +1,98 @@
+#include "matrix_core_swizzle.hpp"
+#include "matrix_core_swizzle_kernel.hpp"
+
+float matrix_core_swizzle(matrix_core_swizzle_traits t,
+                          matrix_core_swizzle_args a,
+                          const ck_tile::stream_config& s)
+{
+    if(t.data_type.compare("fp16") == 0)
+    {
+        if(t.inst.compare("32x32x8") == 0)
+        {
+            constexpr int BLOCK_SIZE             = 256;
+            constexpr int NPerBlock              = 256;
+            constexpr int KPerBlock              = 128;
+            constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_32x32x8_F16;
+            if(t.permute.compare("0,1,4,2,5,3,6") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+            else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+            else if(t.permute.compare("0,1,3,4,2,5") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+        }
+        else if(t.inst.compare("16x16x16") == 0)
+        {
+            constexpr int BLOCK_SIZE             = 256;
+            constexpr int NPerBlock              = 256;
+            constexpr int KPerBlock              = 128;
+            constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_16x16x16_F16;
+            if(t.permute.compare("0,1,4,2,5,3,6") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+            else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+            else if(t.permute.compare("0,1,3,4,2,5") == 0)
+            {
+                constexpr matrix_core_permute_style pstyle =
+                    matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv;
+                using Kernel =
+                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+                auto k         = Kernel(a);
+                float ave_time = ck_tile::launch_kernel(s, k);
+
+                return ave_time;
+            }
+        }
+    }
+    return -1;
+}
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
new file mode 100644
index 000000000..e1ecdbbe6
--- /dev/null
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "matrix_core_swizzle_kernel.hpp"
+#include <string>
+
+struct matrix_core_swizzle_traits
+{
+    std::string data_type; // fp16 only
+    std::string inst;      // 32x32x8, 16x16x16
+    std::string permute;   //
+};
+
+using matrix_core_swizzle_args = matrix_core_swizzle_host_args;
+
+// host API
+float matrix_core_swizzle(matrix_core_swizzle_traits,
+                          matrix_core_swizzle_args,
+                          const ck_tile::stream_config&);
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
new file mode 100644
index 000000000..60ac103ec
--- /dev/null
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+// if set to 1, slightly more instructions generated to calculate address
+#ifndef MERGE_2D_013425
+#define MERGE_2D_013425 0
+#endif
+
+enum class matrix_core_inst_enum
+{
+    MFMA_32x32x8_F16  = 0,
+    MFMA_16x16x16_F16 = 1,
+};
+
+namespace detail {
+template <matrix_core_inst_enum>
+struct to_warp_gemm;
+
+template <>
+struct to_warp_gemm<matrix_core_inst_enum::MFMA_32x32x8_F16>
+{
+    using type = ck_tile::WarpGemmMfmaF16F16F32M32N32K8;
+};
+
+template <>
+struct to_warp_gemm<matrix_core_inst_enum::MFMA_16x16x16_F16>
+{
+    using type = ck_tile::WarpGemmMfmaF16F16F32M16N16K16;
+};
+} // namespace detail
+template <matrix_core_inst_enum Inst>
+using to_warp_gemm_t = typename detail::to_warp_gemm<Inst>::type;
+
+// TODO: in below permute pattern, the last 3 dim is within wave
+enum class matrix_core_permute_style
+{
+    permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6
+    permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6
+    permute_b_nr_kr_kw_nw_kv    = 2, // 0,1,3,4,2,5
+    permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv,
+};
+
+// assume this is B matrix, originally we have batch*n*k
+// now batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
+// assume using 32x32x8-f16, 4 waves and extend the KPerLane to 8xfp16(dwordx4)
+//
+//                                      4(waves)  32(mfma_m lane)
+//                                          |      |
+// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 -> 8(thread loading)
+//                                    nr  kr    |
+//        nr  4  32 kr 2  8                     2(klane)
+//
+// permute: 0,1,4,2,5,3,6
+// or
+// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*n1*k0*k1*n2*k2 -> 8(thread loading)
+// permute: 0,1,2,4,5,3,6
+//
+// this kernel only deal with fp16/bf16 data(16bit), and use 2d block size to do the swizzling
+// for simplicity, only consider n/k is multiple of block-size
+
+// independend host arg with no template
+struct matrix_core_swizzle_host_args
+{
+    const void* p_src;
+    void* p_dst;
+    int32_t batch;
+    int32_t n;
+    int32_t k;
+};
+
+// NOTE: this kernel could follow the style of generic permute kernel
+// but here we pass in fixed layout as template arg and generate different kernel instance
+// purposely
+template <int BLOCK_SIZE_ = 256,
+          int NPerBlock_  = 256,
+          int KPerBlock_  = 128,
+          matrix_core_permute_style pstyle_ =
+              matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2,
+          matrix_core_inst_enum Inst_ = matrix_core_inst_enum::MFMA_32x32x8_F16>
+struct matrix_core_swizzle_kernel
+{
+    using karg = matrix_core_swizzle_host_args;
+    using harg = matrix_core_swizzle_host_args;
+
+    static constexpr int BLOCK_SIZE      = BLOCK_SIZE_;
+    static constexpr int WavesPerBlock_N = 4;
+    static constexpr int WavesPerBlock_K = 1;
+    static_assert(WavesPerBlock_N * WavesPerBlock_K * 64 == BLOCK_SIZE);
+    static constexpr int NPerBlock                    = NPerBlock_;
+    static constexpr int KPerBlock                    = KPerBlock_;
+    static constexpr matrix_core_permute_style pstyle = pstyle_;
+    static constexpr matrix_core_inst_enum Inst       = Inst_;
+
+    static constexpr ck_tile::index_t Alignment = 8;
+    karg a;
+    dim3 grids;
+
+    using WarpGemm = to_warp_gemm_t<Inst>;
+
+    __host__ matrix_core_swizzle_kernel(harg h)
+    {
+        a                   = h;
+        ck_tile::index_t ns = (h.n + NPerBlock - 1) / NPerBlock;
+        ck_tile::index_t ks = (h.k + KPerBlock - 1) / KPerBlock;
+        grids               = dim3(ks, ns, h.batch);
+    }
+
+    __host__ bool is_applicable(harg h) { return h.n % NPerBlock == 0 && h.k % KPerBlock == 0; }
+
+    __host__ void operator()(const ck_tile::stream_config& s) const
+    {
+        ck_tile::kentry<BLOCK_SIZE, 1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
+    }
+
+    struct kernel
+    {
+        __device__ static constexpr auto get_src_dist()
+        {
+            using namespace ck_tile;
+            constexpr index_t K2 = Alignment;
+            constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t N1 = BLOCK_SIZE / get_warp_size();
+
+            static_assert(NPerBlock % (N1 * N2) == 0);
+            static_assert(KPerBlock % (K1 * K2) == 0);
+
+            constexpr index_t K0 = KPerBlock / (K1 * K2);
+            constexpr index_t N0 = NPerBlock / (N1 * N2);
+
+            // clang-format off
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,// 0
+                    //             1              2            3             4             5             6
+                    tuple<sequence<N0>, sequence<N1>, sequence<N2>, sequence<K0>, sequence<K1>, sequence<K2>>,
+
+                    //            N1           K1  N2
+                    tuple<sequence<2>, sequence<5, 3>>,
+                    tuple<sequence<0>, sequence<0, 0>>,
+
+                    //       N0 K0 K2
+                    sequence<1, 4, 6>,
+                    sequence<0, 0, 0>>{});
+            // clang-format on
+        }
+        __device__ static constexpr auto get_dst_dist()
+        {
+            using namespace ck_tile;
+            constexpr index_t K2 = Alignment;
+            constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t N1 = BLOCK_SIZE / get_warp_size();
+
+            static_assert(NPerBlock % (N1 * N2) == 0);
+            static_assert(KPerBlock % (K1 * K2) == 0);
+
+            constexpr index_t K0 = KPerBlock / (K1 * K2);
+            constexpr index_t N0 = NPerBlock / (N1 * N2);
+
+            if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+            {
+                // clang-format off
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<
+                        sequence<1>,// 0
+                        //             1              2            3             4             5             6
+                        tuple<sequence<N0>, sequence<K0>, sequence<N1>, sequence<K1>, sequence<N2>, sequence<K2>>,
+
+                        //            N1           K1  N2
+                        tuple<sequence<3>, sequence<4, 5>>,
+                        tuple<sequence<0>, sequence<0, 0>>,
+
+                        //       N0 K0 K2
+                        sequence<1, 2, 6>,
+                        sequence<0, 0, 0>>{});
+                // clang-format on
+            }
+            else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+            {
+                // clang-format off
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<
+                        sequence<1>,// 0
+                        //             1              2            3             4             5             6
+                        tuple<sequence<N0>, sequence<N1>, sequence<K0>, sequence<K1>, sequence<N2>, sequence<K2>>,
+
+                        //            N1           K1  N2
+                        tuple<sequence<2>, sequence<4, 5>>,
+                        tuple<sequence<0>, sequence<0, 0>>,
+
+                        //       N0 K0 K2
+                        sequence<1, 3, 6>,
+                        sequence<0, 0, 0>>{});
+                // clang-format on
+            }
+            else
+            {
+                // clang-format off
+                // permute_b_nr_kr_kw_nw_kv or permute_b_nr_kr_waveflatten
+                constexpr index_t Kv = Alignment;
+                constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+
+                static_assert(KPerBlock % (K1 * K2) == 0);
+                constexpr index_t Nr = NPerBlock / Nw;
+                constexpr index_t Kr = KPerBlock / (Kv * Kw);
+
+                constexpr index_t Nr_p = WavesPerBlock_N;
+                constexpr index_t Kr_p = WavesPerBlock_K;
+                constexpr index_t Nr_y = Nr / Nr_p;
+                constexpr index_t Kr_y = Kr / Kr_p;
+
+                return make_static_tile_distribution(
+#if MERGE_2D_013425
+                    tile_distribution_encoding<
+                        sequence<1>,// 0    R
+                        // major       1                         2
+                        // minor       0     1     2             0     1     2   3
+                        tuple<sequence<Nr_y, Nr_p, Nw>, sequence<Kr_y, Kr_p, Kw, Kv>>,    // H
+
+                        //            Nr_p, Kr_p         Kw Nw
+                        tuple<sequence<1  , 2>, sequence<2, 1>>,    // p major
+                        tuple<sequence<1  , 1>, sequence<2, 2>>,    // p minor
+
+                        //       Nr_y Kr_y Kv
+                        sequence<1,   2,   2>,          // Y major
+                        sequence<0,   0,   3>>{});      // y minor
+#else
+                    tile_distribution_encoding<
+                        sequence<1>,// 0    R
+                        // major       1                     2                     3
+                        // minor       0     1               0     1               0   1   2
+                        tuple<sequence<Nr_y, Nr_p>, sequence<Kr_y, Kr_p>, sequence<Kw, Nw, Kv>>,    // H
+
+                        //            Nr_p, Kr_p         Kw Nw
+                        tuple<sequence<1  , 2>, sequence<3, 3>>,    // p major
+                        tuple<sequence<1  , 1>, sequence<0, 1>>,    // p minor
+
+                        //       Nr_y Kr_y Kv
+                        sequence<1,   2,   3>,          // Y major
+                        sequence<0,   0,   2>>{});      // y minor
+#endif
+                // clang-format on
+            }
+        }
+
+        __device__ void operator()(karg a_)
+        {
+            using namespace ck_tile;
+            index_t i_k = blockIdx.x;
+            index_t i_n = blockIdx.y;
+            index_t i_b = blockIdx.z;
+
+            constexpr index_t k2 = Alignment;
+            constexpr index_t n2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t k1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t n1 = BLOCK_SIZE / get_warp_size();
+            const index_t k0     = a_.k / (k1 * k2);
+            const index_t n0     = a_.n / (n1 * n2);
+
+            constexpr index_t k2_tile = Alignment;
+            constexpr index_t n2_tile = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t k1_tile = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t n1_tile = BLOCK_SIZE / get_warp_size();
+            constexpr index_t k0_tile = KPerBlock / (k1_tile * k2_tile);
+            constexpr index_t n0_tile = NPerBlock / (n1_tile * n2_tile);
+
+            const fp16_t* p_src = reinterpret_cast<const fp16_t*>(a_.p_src) + i_b * a_.k * a_.n;
+            fp16_t* p_dst       = reinterpret_cast<fp16_t*>(a_.p_dst) + i_b * a_.k * a_.n;
+
+            const auto src_view = [&]() {
+                const auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                    p_src,
+                    make_tuple(n0, n1, n2, k0, k1, k2),
+                    number<Alignment>{}); // control vector load
+                return tmp;
+            }();
+
+            const auto src_window = make_tile_window(src_view,
+                                                     make_tuple(number<n0_tile>{},
+                                                                number<n1_tile>{},
+                                                                number<n2_tile>{},
+                                                                number<k0_tile>{},
+                                                                number<k1_tile>{},
+                                                                number<k2_tile>{}),
+                                                     {i_n * n0_tile, 0, 0, i_k * k0_tile, 0, 0},
+                                                     get_src_dist());
+
+            auto dst_view = [&]() {
+                if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+                {
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(n0, k0, n1, k1, n2, k2),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+                }
+                else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+                {
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(n0, n1, k0, k1, n2, k2),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+                }
+                else
+                {
+#if MERGE_2D_013425
+                    constexpr index_t kv = Alignment;
+                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    // constexpr index_t waveflatten = kw*nw*kv;
+                    const index_t kr = a_.k / (k1 * k2);
+                    const index_t nr = a_.n / nw;
+                    auto tmp         = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(nr, kr, number<kw>{}, number<nw>{}, number<kv>{}),
+                        number<Alignment>{}); // control vector load
+                    auto tmp_1 = transform_tensor_view(
+                        tmp,
+                        make_tuple(
+                            make_merge_transform(make_tuple(nr, number<nw>{})),
+                            make_merge_transform(make_tuple(kr, number<kw>{}, number<kv>{}))),
+                        make_tuple(sequence<0, 3>{}, sequence<1, 2, 4>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                    return tmp_1;
+#else
+                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv,
+                    constexpr index_t kv = Alignment;
+                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t waveflatten = kw * nw * kv;
+                    const index_t kr = a_.k / (k1 * k2);
+                    const index_t nr = a_.n / nw;
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(nr, kr, waveflatten),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+#endif
+                }
+            }();
+
+            auto dst_window = [&]() {
+                if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+                {
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<n0_tile>{},
+                                                       number<k0_tile>{},
+                                                       number<n1_tile>{},
+                                                       number<k1_tile>{},
+                                                       number<n2_tile>{},
+                                                       number<k2_tile>{}),
+                                            {i_n * n0_tile, i_k * k0_tile, 0, 0, 0, 0},
+                                            get_dst_dist());
+                }
+                else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+                {
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<n0_tile>{},
+                                                       number<n1_tile>{},
+                                                       number<k0_tile>{},
+                                                       number<k1_tile>{},
+                                                       number<n2_tile>{},
+                                                       number<k2_tile>{}),
+                                            {i_n * n0_tile, 0, i_k * k0_tile, 0, 0, 0},
+                                            get_dst_dist());
+                }
+                else
+                {
+#if MERGE_2D_013425
+                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                            {i_n * NPerBlock, i_k * KPerBlock},
+                                            get_dst_dist());
+#else
+                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv
+                    constexpr index_t kv = Alignment;
+                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t waveflatten_tile = kw * nw * kv;
+                    constexpr index_t nr_tile = NPerBlock / nw;
+                    constexpr index_t kr_tile = KPerBlock / (kw * kv);
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<nr_tile>{},
+                                                       number<kr_tile>{},
+                                                       number<waveflatten_tile>{}),
+                                            {i_n * nr_tile, i_k * kr_tile, 0},
+                                            get_dst_dist());
+#endif
+                }
+            }();
+
+            // actual load store
+            auto src_tile = load_tile(src_window);
+
+            // now we only swap the distribution from src to dst, no extra movement occurs
+            auto dst_tile                = make_static_distributed_tensor<fp16_t>(get_dst_dist());
+            dst_tile.get_thread_buffer() = src_tile.get_thread_buffer();
+
+            // final store
+            store_tile(dst_window, dst_tile);
+        }
+    };
+};
diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp
new file mode 100644
index 000000000..af95b64e6
--- /dev/null
+++ b/example/ck_tile/06_permute/permute.cpp
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "permute.hpp"
+#include "ck_tile/host.hpp"
+
+#include <array>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+#include "alternative_impl/matrix_core_swizzle.hpp"
+#endif
+
+namespace detail {
+template <int bytes>
+struct to_integer_type;
+
+template <>
+struct to_integer_type<4>
+{
+    using type = int32_t;
+};
+template <>
+struct to_integer_type<2>
+{
+    using type = int16_t;
+};
+template <>
+struct to_integer_type<1>
+{
+    using type = int8_t;
+};
+} // namespace detail
+
+template <int bytes>
+using to_integer_type = typename detail::to_integer_type<bytes>::type;
+
+// host API (shoule come from codegen)
+float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
+{
+    if(t.data_type.compare("fp8") == 0)
+    {
+        using DataType        = ck_tile::fp8_t;
+        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
+        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
+
+        auto kargs = Kernel::MakeKargs(a);
+
+        const dim3 grids      = Kernel::GridSize(a);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    }
+    else if(t.data_type.compare("fp16") == 0)
+    {
+        using DataType        = ck_tile::half_t;
+        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
+        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
+
+        auto kargs = Kernel::MakeKargs(a);
+
+        const dim3 grids      = Kernel::GridSize(a);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    }
+    else if(t.data_type.compare("fp32") == 0)
+    {
+        using DataType        = float;
+        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
+        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
+
+        auto kargs = Kernel::MakeKargs(a);
+
+        const dim3 grids      = Kernel::GridSize(a);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    }
+
+    return 0;
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    using size_type = typename std::vector<T>::size_type;
+
+    os << "[";
+    for(size_type idx = 0; idx < v.size(); ++idx)
+    {
+        if(0 < idx)
+        {
+            os << ", ";
+        }
+        os << v[idx];
+    }
+    return os << "]";
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "weather do CPU validation or not")
+        .insert("prec", "fp16", "data type. fp8/fp16/fp32 (representing 8/16/32 bit data)")
+        .insert("shape", "2,3,4", "the shape of the input tensor")
+        .insert("perm", "2,1,0", "permute perm")
+        .insert("kname", "0", "t to 1 will print kernel name")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit(std::string /*init_method*/)
+{
+    double rtol = 1e-3;
+    double atol = 1e-3;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+{
+    if(init_method == "ui" || init_method == "ni")
+    {
+        unsigned max_rounding_point_distance = 0;
+        double atol                          = 2e-3;
+        return ck_tile::make_tuple(max_rounding_point_distance, atol);
+    }
+    else
+    {
+        unsigned max_rounding_point_distance = 1;
+        double atol                          = 0.0625;
+        return ck_tile::make_tuple(max_rounding_point_distance, atol);
+    }
+}
+
+// "1,2,3,4" -> vector{1,2,3,4}
+std::vector<ck_tile::index_t> decode_vec(std::string q_val)
+{
+#define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
+    std::string::size_type pos = 0;
+    std::vector<ck_tile::index_t> v;
+    while(true)
+    {
+        auto found = q_val.find(',', pos);
+        ck_tile::index_t n =
+            _S2I_(q_val.substr(pos, found == std::string::npos ? found : found - pos));
+        v.push_back(n);
+        if(found == std::string::npos)
+        {
+            break;
+        }
+        pos = found + 1;
+    }
+    return v;
+#undef _S2I_
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+
+    auto shape        = decode_vec(arg_parser.get_str("shape"));
+    auto perm         = decode_vec(arg_parser.get_str("perm"));
+    int stream_warmup = arg_parser.get_int("warmup");
+    int stream_repeat = arg_parser.get_int("repeat");
+    bool kname        = arg_parser.get_bool("kname");
+    int seed          = arg_parser.get_int("seed");
+
+    assert(shape.size() == perm.size());
+    ck_tile::index_t rank = perm.size();
+    if(rank > ck_tile::GenericPermuteHostArgs::kMaxRanks)
+    {
+        printf("rank %d permute is not support yet\n", rank);
+        return false;
+    }
+
+    ck_tile::HostTensor<DataType> x(shape);
+    ck_tile::FillUniformDistributionIntegerValue<DataType>{-15, 15, seed}(x);
+
+    std::vector<ck_tile::index_t> y_shape = [&]() {
+        std::vector<ck_tile::index_t> tmp(rank, 0);
+        // std::cout << "@@@@" << tmp << std::endl;
+        for(int i = 0; i < static_cast<int>(rank); i++)
+        {
+            // std::cout << "  i:" << i << ", perm:" << perm[i] << ", rak:" <<
+            // static_cast<int>(rank)
+            // << std::endl;
+            tmp[i] = shape[perm[i]];
+        }
+        // std::cout << "@@@" << tmp << std::endl;
+        return tmp;
+    }();
+
+    ck_tile::HostTensor<DataType> y(y_shape);
+
+    ck_tile::DeviceMem x_buf(x.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x.data());
+
+    std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape << ", permute:" << perm
+              << std::flush;
+
+    ck_tile::stream_config stream_config{nullptr,
+                                         true,
+                                         /* log_level = */ (kname ? 1 : 0),
+                                         stream_warmup,
+                                         stream_repeat};
+    float ave_time   = 0.f;
+    auto run_permute = [&]() {
+        permute_traits t;
+        t.data_type = data_type;
+
+        permute_args a;
+        a.p_src = x_buf.GetDeviceBuffer();
+        a.p_dst = y_buf.GetDeviceBuffer();
+        a.rank  = rank;
+        std::copy(shape.begin(), shape.end(), a.shape);
+        std::copy(perm.begin(), perm.end(), a.perm);
+
+        return permute(t, a, stream_config);
+    };
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+    // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
+    if((arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") ||
+        arg_parser.get_str("perm") == std::string("0,1,2,4,5,3,6") ||
+        arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")))
+    {
+        if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5"))
+        {
+            // permute_b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
+            matrix_core_swizzle_traits t;
+            t.data_type = data_type;
+            t.permute   = arg_parser.get_str("perm");
+
+            matrix_core_swizzle_args a;
+            a.p_src = x_buf.GetDeviceBuffer();
+            a.p_dst = y_buf.GetDeviceBuffer();
+            a.batch = shape[0];
+
+            auto nr = shape[1];
+            auto nw = shape[2];
+            auto kr = shape[3];
+            auto kw = shape[4];
+            auto kv = shape[5];
+            a.n     = nr * nw;
+            a.k     = kr * kw * kv;
+            if(kv == 8 && kw == 4 && nw == 16 && nr % 4 == 0 && kr % 8 == 0)
+            {
+                t.inst = "16x16x16";
+                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else if(kv == 8 && kw == 2 && nw == 32 && nr % 4 == 0 && kr % 8 == 0)
+            {
+                t.inst = "32x32x8";
+                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else
+            {
+                ave_time = run_permute();
+            }
+        }
+        else
+        {
+            matrix_core_swizzle_traits t;
+            t.data_type = data_type;
+            t.permute   = arg_parser.get_str("perm");
+
+            matrix_core_swizzle_args a;
+            a.p_src = x_buf.GetDeviceBuffer();
+            a.p_dst = y_buf.GetDeviceBuffer();
+            a.batch = shape[0];
+            a.n     = shape[1] * shape[2] * shape[3];
+            a.k     = shape[4] * shape[5] * shape[6];
+            if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 &&
+               shape[4] % 8 == 0 && shape[1] % 2 == 0)
+            {
+                // 32x32x8 inst
+                // perm=0,1,4,2,5,3,6
+                // y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8)
+                // shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8)
+
+                t.inst = "32x32x8";
+                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 &&
+                    shape[4] % 4 == 0 && shape[1] % 4 == 0)
+            {
+                // 16x16x16 inst
+                // perm=0,1,4,2,5,3,6
+                // y_shape=*,4x,4x,4,4,16,8
+                // shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8)
+                t.inst = "16x16x16";
+                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else
+            {
+                ave_time = run_permute();
+            }
+        }
+    }
+    else
+#endif
+    {
+        ave_time = run_permute();
+    }
+    std::cout << ", time:" << ave_time << "ms" << std::flush;
+
+    bool pass = true;
+    if(do_validation)
+    {
+        reference_permute(x, y, perm);
+#if 0
+        if constexpr (std::is_same_v<float, DataType>){
+            // using itype = to_integer_type<sizeof(DataType)>;
+            fflush(stdout);
+            for(int zz = 0; zz < static_cast<int>(x.get_element_size()); zz++   ) {
+                printf("%3.0f ", x.mData[zz]);
+            }
+            printf("->\n");
+            for(int zz = 0; zz < static_cast<int>(x.get_element_size()); zz++   ) {
+                printf("%3.0f ", y.mData[zz]);
+            }
+            fflush(stdout);
+        }
+#endif
+        ck_tile::HostTensor<DataType> y_dev(y.get_lengths());
+
+        y_buf.FromDevice(y_dev.data());
+
+        pass = std::equal(
+            y_dev.begin(), y_dev.end(), y.begin(), [&](const DataType& d, const DataType& h) {
+                using itype = to_integer_type<sizeof(DataType)>;
+                itype i_d   = ck_tile::bit_cast<itype>(d);
+                itype i_h   = ck_tile::bit_cast<itype>(h);
+                return i_d == i_h;
+            });
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+    }
+
+    std::cout << std::endl;
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp8")
+    {
+        return run<ck_tile::fp8_t>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "fp32")
+    {
+        return run<float>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/06_permute/permute.hpp b/example/ck_tile/06_permute/permute.hpp
new file mode 100644
index 000000000..304da4dc9
--- /dev/null
+++ b/example/ck_tile/06_permute/permute.hpp
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/permute.hpp"
+#include <string>
+
+struct permute_traits
+{
+    std::string data_type;
+};
+
+using permute_args = ck_tile::GenericPermuteHostArgs;
+
+// host API
+float permute(permute_traits, permute_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/06_permute/script/smoke_test.sh b/example/ck_tile/06_permute/script/smoke_test.sh
new file mode 100644
index 000000000..793e52d2b
--- /dev/null
+++ b/example/ck_tile/06_permute/script/smoke_test.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+# TODO: run this script from CK root
+BUILD=build
+EXE=$BUILD/bin/tile_example_permute
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+if [ $# -ge 1 ] ; then
+    set -x
+fi
+
+$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=2,8,16,8,4,8 -perm=0,1,3,4,2,5  $COMMON_ARGS
+$EXE -prec=fp16 -shape=1,24,32,16,2,8 -perm=0,1,3,4,2,5  $COMMON_ARGS
+
+echo "------------------------------------------------------------------"
+
+for prec in "fp8" "fp16" "fp32" ; do
+
+$EXE -prec=$prec -shape=3,8 -perm=1,0 $COMMON_ARGS
+$EXE -prec=$prec -shape=48,6,8 -perm=2,1,0  $COMMON_ARGS
+$EXE -prec=$prec -shape=24,128,3 -perm=0,2,1  $COMMON_ARGS
+$EXE -prec=$prec -shape=4,10,7,6 -perm=0,2,3,1  $COMMON_ARGS
+$EXE -prec=$prec -shape=8,24,36,10 -perm=3,1,2,0  $COMMON_ARGS
+$EXE -prec=$prec -shape=8,1,36,4 -perm=2,1,0,3  $COMMON_ARGS
+$EXE -prec=$prec -shape=5,10,16,2,36,4 -perm=4,5,2,1,0,3  $COMMON_ARGS
+$EXE -prec=$prec -shape=2,32,8,3,6,2,5,4 -perm=5,2,4,7,1,6,3,0  $COMMON_ARGS
+echo "------------------------------------------------------------------"
+done
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 366fb18a0..c85e31341 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -7,5 +7,6 @@ add_subdirectory(02_layernorm2d)
 add_subdirectory(03_gemm)
 add_subdirectory(04_img2col)
 add_subdirectory(05_reduce)
+add_subdirectory(06_permute)
 add_subdirectory(09_topk_softmax)
 
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index e17d7c22a..a17ce751c 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -22,6 +22,7 @@
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_permute.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
 #include "ck_tile/host/reference/reference_topk.hpp"
diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp
new file mode 100644
index 000000000..1c8248340
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_permute.hpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+#include <numeric>
+#include <functional>
+
+namespace ck_tile {
+
+/*
+    this will do permute + contiguous like functionality in pytorch
+*/
+template <typename DataType>
+CK_TILE_HOST void
+reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> dims)
+{
+    const auto x_len = x.mDesc.get_lengths();
+    const auto y_len = y.mDesc.get_lengths();
+    assert(x_len.size() == y_len.size());
+    index_t rank     = x_len.size();
+    const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies<index_t>());
+    const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies<index_t>());
+    assert(x_elm == y_elm);
+    (void)y_elm;
+
+    auto f = [&](auto i_element) {
+        std::vector<size_t> y_coord = [&]() {
+            std::vector<size_t> tmp(rank, 0);
+            size_t r = i_element;
+            for(index_t i = rank - 1; i >= 0; i--)
+            {
+                tmp[i] = r % y_len[i];
+                r      = r / y_len[i];
+            }
+            return tmp;
+        }();
+
+        std::vector<size_t> x_coord = [&]() {
+            std::vector<size_t> tmp(rank, 0);
+            for(index_t i = 0; i < rank; i++)
+            {
+                tmp[dims[i]] = y_coord[i];
+            }
+            return tmp;
+        }();
+
+        // do permute
+        y(y_coord) = x(x_coord);
+    };
+
+    make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp
new file mode 100644
index 000000000..ee8c69372
--- /dev/null
+++ b/include/ck_tile/ops/permute.hpp
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp"
+#include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
new file mode 100644
index 000000000..1c5cc4a11
--- /dev/null
+++ b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+// #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
+
+namespace ck_tile {
+
+/* independent host side argument, no template
+ */
+struct GenericPermuteHostArgs
+{
+    static constexpr index_t kMaxRanks = 8; // TODO: hardcoded
+
+    const void* p_src;
+    void* p_dst;
+    index_t rank;
+    index_t shape[kMaxRanks]; // input shape
+    index_t perm[kMaxRanks];  // permute index
+};
+
+/*
+simulate torch.permute:
+x_ = x_.view(x.shape[0],
+                    x.shape[1]//16, 16,
+                    x.shape[2]//32, 4, 8)
+x_ = x_.permute(0,1,3,4,2,5)
+x_ = x_.contiguous()
+x_ = x_.view(x.shape[0], x.shape[1], x.shape[2]);//
+
+this kernel is supposed not to be performant(just OK), with functional support up to kMaxRanks
+dim of permutation, with a single kernel
+
+*/
+template <typename Problem_>
+struct GenericPermute
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+
+    using DataType                      = remove_cvref_t<typename Problem::DataType>;
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+    static constexpr index_t kMaxRanks  = Problem::kMaxRanks;
+    static constexpr bool KeepLastDim   = Problem::KeepLastDim;
+
+    struct __attribute__((packed)) Kargs
+    {
+        const void* p_src;
+        void* p_dst;
+        // index_t rank;
+        index_t num_elements;
+        index_t perm_length[kMaxRanks]; // tensor length after permutation
+        index_t perm_stride[kMaxRanks]; // tensor stride after permutation
+    };
+
+    CK_TILE_HOST static constexpr index_t TotalElements(const GenericPermuteHostArgs& h)
+    {
+        index_t n = 1;
+        for(auto i = 0; i < h.rank; i++)
+        {
+            n *= h.shape[i];
+        }
+        return n;
+    }
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const GenericPermuteHostArgs& h)
+    {
+        Kargs a;
+        a.p_src = h.p_src;
+        a.p_dst = h.p_dst;
+
+        // assert rank <= kMaxRanks
+        index_t i = 0;
+
+        index_t perm[kMaxRanks];
+        index_t x_shape[kMaxRanks];
+        index_t x_stride[kMaxRanks];
+        // index_t perm_length[kMaxRanks];
+
+        for(; i < h.rank; i++)
+        {
+            x_shape[i] = h.shape[i];
+            perm[i]    = h.perm[i];
+        }
+        for(; i < kMaxRanks; i++)
+        {
+            x_shape[i] = 1;
+            perm[i]    = i; // will index to len = 1
+        }
+
+        index_t stride = 1;
+        for(index_t j = kMaxRanks - 1; j >= 0; j--)
+        {
+            x_stride[j] = stride;
+            stride *= x_shape[j];
+        }
+
+        for(index_t j = 0; j < kMaxRanks; j++)
+        {
+            a.perm_length[j] = x_shape[perm[j]];
+            a.perm_stride[j] = x_stride[perm[j]];
+        }
+
+        a.num_elements = TotalElements(h);
+        return a;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(GenericPermuteHostArgs h)
+    {
+        auto total = TotalElements(h);
+        auto grids = dim3((total + BlockSize() - 1) / BlockSize());
+        //  printf("### total:%d, grids:%dx%dx%d\n", total, );
+        return grids;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        index_t id = blockIdx.x * BlockSize() + threadIdx.x;
+
+        if(id >= kargs.num_elements)
+            return;
+
+        const auto perm_length =
+            generate_tuple([&](auto I) { return kargs.perm_length[I]; }, number<kMaxRanks>{});
+        const auto perm_stride =
+            generate_tuple([&](auto I) { return kargs.perm_stride[I]; }, number<kMaxRanks>{});
+
+        const DataType* p_src = reinterpret_cast<const DataType*>(kargs.p_src);
+        DataType* p_dst       = reinterpret_cast<DataType*>(kargs.p_dst);
+
+        const auto src_view_0 = make_naive_tensor_view<address_space_enum::global>(
+            p_src, perm_length, perm_stride, number<1>{}, number<1>{});
+
+        const auto src_view = transform_tensor_view(
+            src_view_0,
+            make_tuple(make_merge_transform(perm_length)),
+            make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
+            make_tuple(sequence<0>{}));
+
+        auto dst_view_0 = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_dst, perm_length, number<1>{});
+
+        auto dst_view = transform_tensor_view(
+            dst_view_0,
+            make_tuple(make_merge_transform(perm_length)),
+            make_tuple(typename arithmetic_sequence_gen<0, kMaxRanks, 1>::type{}),
+            make_tuple(sequence<0>{}));
+
+        // TODO: hard code to vector 1
+        using vector_t = thread_buffer<DataType, 1>;
+
+        const auto src_coord =
+            make_tensor_coordinate(src_view.get_tensor_descriptor(), array<index_t, 1>{id});
+        const auto dst_coord =
+            make_tensor_coordinate(dst_view.get_tensor_descriptor(), array<index_t, 1>{id});
+
+        // printf("src id:%d, os:%d\n", id, src_coord.get_offset());
+        // printf("dst id:%d, os:%d\n", id, dst_coord.get_offset());
+
+        const vector_t x = src_view.template get_vectorized_elements<vector_t>(src_coord, 0);
+        dst_view.template set_vectorized_elements<vector_t>(dst_coord, 0, x);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp
new file mode 100644
index 000000000..e504ed747
--- /dev/null
+++ b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+template <typename DataType_,
+          index_t kBlockSize_ = 256,
+          index_t kMaxRanks_  = 8,
+          bool KeepLastDim_   = false>
+struct GenericPermuteProblem
+{
+    using DataType                      = remove_cvref_t<DataType_>;
+    static constexpr index_t kBlockSize = kBlockSize_;
+    static constexpr index_t kMaxRanks  = kMaxRanks_;
+    /* KeepLastDim:
+     *  if last dim keep the same? this can help enable vector load
+     *   permute(0, 2, 4, 1, 3, 5) -> true
+     *   permute(0, 3, 2, 1) -> false
+     */
+    static constexpr bool KeepLastDim = KeepLastDim_;
+    // TODO: not used(?)
+};
+
+} // namespace ck_tile
-- 
GitLab


From 4d7e063a0a2dfb183bc3876b1ff021829aabd38b Mon Sep 17 00:00:00 2001
From: valarLip <103567126+valarLip@users.noreply.github.com>
Date: Tue, 29 Oct 2024 18:19:29 +0800
Subject: [PATCH 023/153] [CK_TILE] add scatter_gather (#1609)

---
 include/ck_tile/core.hpp                      |   1 +
 .../core/algorithm/coordinate_transform.hpp   | 104 +++++++
 .../core/algorithm/indexing_adaptor.hpp       |  60 ++++
 test/CMakeLists.txt                           |   1 +
 test/scatter_gather/CMakeLists.txt            |   2 +
 test/scatter_gather/scatter_gather.cpp        | 276 ++++++++++++++++++
 6 files changed, 444 insertions(+)
 create mode 100644 include/ck_tile/core/algorithm/indexing_adaptor.hpp
 create mode 100644 test/scatter_gather/CMakeLists.txt
 create mode 100644 test/scatter_gather/scatter_gather.cpp

diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 56dfbd636..14991d375 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -5,6 +5,7 @@
 
 #include "ck_tile/core/algorithm/cluster_descriptor.hpp"
 #include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/algorithm/indexing_adaptor.hpp"
 #include "ck_tile/core/algorithm/space_filling_curve.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/arch/arch.hpp"
diff --git a/include/ck_tile/core/algorithm/coordinate_transform.hpp b/include/ck_tile/core/algorithm/coordinate_transform.hpp
index 5c7e48980..aaa7db257 100644
--- a/include/ck_tile/core/algorithm/coordinate_transform.hpp
+++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp
@@ -23,6 +23,7 @@ enum struct coord_transform_enum
     replicate,
     xor_t,
     offset,
+    indexing,
 };
 
 template <index_t NDimLow, index_t NDimUp>
@@ -1526,6 +1527,88 @@ struct offset : public base_transform<1, 1>
     }
 };
 
+template <typename UpLength, typename IndexingAdaptor>
+struct indexing : public base_transform<1, 1>
+{
+    static constexpr index_t NDimUp = 1;
+
+    using LowerIndex = multi_index<1>;
+    using UpperIndex = multi_index<1>;
+
+    using UpLengths = decltype(make_tuple(UpLength{}));
+    UpLengths up_lengths_;
+    IndexingAdaptor iadaptor_;
+
+    CK_TILE_HOST_DEVICE constexpr indexing() = default;
+
+    CK_TILE_HOST_DEVICE constexpr indexing(const UpLength& up_length,
+                                           const IndexingAdaptor& iadaptor)
+        : up_lengths_{make_tuple(up_length)}, iadaptor_{iadaptor}
+    {
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_type_enum()
+    {
+        return coord_transform_enum::indexing;
+    }
+
+    CK_TILE_HOST_DEVICE constexpr const auto& get_upper_lengths() const { return up_lengths_; }
+
+    template <typename LowIdx, typename UpIdx>
+    CK_TILE_HOST_DEVICE constexpr void calculate_lower_index(LowIdx& idx_low,
+                                                             const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::size() == 1 && UpIdx::size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+        iadaptor_.calculate_lower_index(idx_low, idx_up);
+    }
+
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+    CK_TILE_HOST_DEVICE void update_lower_index(LowIdxDiff& idx_diff_low,
+                                                const UpIdxDiff& idx_diff_up,
+                                                LowIdx& idx_low,
+                                                const UpIdx& idx_up) const
+    {
+        // TODO: nonthing changed here
+        static_assert(LowIdxDiff::size() == 1 && UpIdxDiff::size() == NDimUp &&
+                          LowIdx::size() == 1 && UpIdx::size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+
+        iadaptor_.update_lower_index(idx_diff_low, idx_diff_up, idx_low, idx_up);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr bool
+    is_valid_upper_index_always_mapped_to_valid_lower_index()
+    {
+        return true;
+    }
+
+    template <typename UpIdx>
+    CK_TILE_HOST_DEVICE static constexpr bool
+    is_valid_upper_index_mapped_to_valid_lower_index(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr bool is_known_at_compile_time()
+    {
+        return ck_tile::is_known_at_compile_time<UpLengths>::value &&
+               IndexingAdaptor::is_known_at_compile_time();
+    }
+
+    CK_TILE_HOST_DEVICE void print() const
+    {
+        printf("embed{");
+
+        //
+        printf("up_lengths_: ");
+        print(up_lengths_);
+        printf(", ");
+
+        printf("}");
+    }
+};
+
 //*******************************************************************************************************
 
 template <typename LowLength>
@@ -1646,3 +1729,24 @@ CK_TILE_HOST_DEVICE constexpr auto make_offset_transform(const LowLength& low_le
 }
 
 } // namespace ck_tile
+
+#include "ck_tile/core/algorithm/indexing_adaptor.hpp"
+namespace ck_tile {
+
+template <typename UpLength, typename Indices>
+CK_TILE_HOST_DEVICE constexpr auto make_indexing_transform(const UpLength& up_lengths,
+                                                           const Indices& indices)
+{
+    // by default we use the simplest one
+    return indexing<UpLength, indexing_adaptor_onshot_cached<remove_cvref_t<Indices>>>{
+        up_lengths, indexing_adaptor_onshot_cached<remove_cvref_t<Indices>>{indices}};
+}
+
+template <typename UpLength, typename IndexingAdaptor>
+CK_TILE_HOST_DEVICE constexpr auto
+make_indexing_transform_with_adaptor(const UpLength& up_lengths, const IndexingAdaptor& iadaptor)
+{
+    return indexing<UpLength, IndexingAdaptor>{up_lengths, iadaptor};
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/algorithm/indexing_adaptor.hpp b/include/ck_tile/core/algorithm/indexing_adaptor.hpp
new file mode 100644
index 000000000..ef59abdc9
--- /dev/null
+++ b/include/ck_tile/core/algorithm/indexing_adaptor.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/multi_index.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+//  pre-defined indexing adaptor used for indexing(scatter/gather)
+
+// this version cache the index inside thread register(which is also prefered in real senario)
+// however it's user's responsibility that each thread only provide one indexing, which means
+// move coordinate will not change on this dim
+template <typename IndexingType>
+struct indexing_adaptor_onshot_cached
+{
+
+    CK_TILE_HOST_DEVICE constexpr indexing_adaptor_onshot_cached() = default;
+    CK_TILE_HOST_DEVICE constexpr indexing_adaptor_onshot_cached(const IndexingType& idx)
+        : cached_idx_(idx)
+    {
+    }
+    IndexingType cached_idx_;
+
+    template <typename LowIdx, typename UpIdx>
+    CK_TILE_HOST_DEVICE constexpr void calculate_lower_index(LowIdx& idx_low,
+                                                             const UpIdx& /*idx_up*/) const
+    {
+        static_assert(LowIdx::size() == 1 && UpIdx::size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        idx_low(number<0>{}) = cached_idx_;
+    }
+
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+    CK_TILE_HOST_DEVICE void update_lower_index(LowIdxDiff& idx_diff_low,
+                                                const UpIdxDiff& idx_diff_up,
+                                                LowIdx& /*idx_low*/,
+                                                const UpIdx& /*idx_up*/) const
+    {
+        // TODO: nonthing changed here
+        static_assert(LowIdxDiff::size() == 1 && UpIdxDiff::size() == 1 && LowIdx::size() == 1 &&
+                          UpIdx::size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        idx_diff_low(number<0>{}) = idx_diff_up[number<0>{}];
+
+        // pass the diff to lower, but not changing the actually index
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr bool is_known_at_compile_time()
+    {
+        return ck_tile::is_known_at_compile_time<IndexingType>::value;
+    }
+};
+} // namespace ck_tile
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b836dd687..b12ced524 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -210,3 +210,4 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL
     add_subdirectory(smfmac_op)
 endif()
 add_subdirectory(position_embedding)
+add_subdirectory(scatter_gather)
diff --git a/test/scatter_gather/CMakeLists.txt b/test/scatter_gather/CMakeLists.txt
new file mode 100644
index 000000000..cc327d42d
--- /dev/null
+++ b/test/scatter_gather/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_test_executable(test_scatter_gather scatter_gather.cpp)
+# target_compile_options(test_scatter_gather PRIVATE -v --save-temps -Wno-gnu-line-marker)
diff --git a/test/scatter_gather/scatter_gather.cpp b/test/scatter_gather/scatter_gather.cpp
new file mode 100644
index 000000000..439e792dd
--- /dev/null
+++ b/test/scatter_gather/scatter_gather.cpp
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+
+#ifndef TEST_SCATTER_GATHER_VERBOSE
+#define TEST_SCATTER_GATHER_VERBOSE 1
+#endif
+
+#define HIP_CALL(call)                                                              \
+    do                                                                              \
+    {                                                                               \
+        hipError_t err = call;                                                      \
+        if(err != hipSuccess)                                                       \
+        {                                                                           \
+            printf("[hiperror](%d) fail to call %s", static_cast<int>(err), #call); \
+            exit(0);                                                                \
+        }                                                                           \
+    } while(0)
+
+/*
+TODO:
+This is a simple design of scatter/gather through indexing transform, with limitations
+We may design a scatter/gather adaptor layer directly inside tile window
+*/
+template <ck_tile::index_t ROW_TILE_SIZE = 8,
+          ck_tile::index_t COL_TILE_SIZE = 32 * 8,
+          ck_tile::index_t BLOCK_SIZE    = 256,
+          ck_tile::index_t ALIGNMENT     = 8,
+          typename INDEX_BUF_TYPE        = ck_tile::index_t,
+          typename DATA_TYPE             = ck_tile::fp16_t>
+__global__ void row_scatter_gather(const INDEX_BUF_TYPE* src_row_idx_ptr,
+                                   const INDEX_BUF_TYPE* dst_row_idx_ptr,
+                                   const DATA_TYPE* src_ptr,
+                                   DATA_TYPE* dst_ptr,
+                                   ck_tile::index_t n_row_total,
+                                   ck_tile::index_t /*n_row_select*/,
+                                   ck_tile::index_t n_cols)
+{
+    using namespace ck_tile;
+
+    // some constexpr vars
+    constexpr index_t vec = ALIGNMENT;
+    static_assert(COL_TILE_SIZE % vec == 0);
+    constexpr index_t col_lanes = COL_TILE_SIZE / vec;
+    constexpr index_t warp_size = ck_tile::get_warp_size();
+    static_assert(warp_size % col_lanes == 0);
+    constexpr index_t row_lanes = warp_size / col_lanes;
+    constexpr index_t num_warps = BLOCK_SIZE / warp_size;
+    static_assert(ROW_TILE_SIZE % (num_warps * row_lanes) == 0);
+    constexpr index_t row_repeat = ROW_TILE_SIZE / (num_warps * row_lanes);
+    static_assert(
+        row_repeat == 1,
+        "currently indexing not support(and would be not performant) if row_repeat has more");
+
+    // tile partitioner
+    index_t tile_col_idx = 0;
+    index_t tile_row_idx = blockIdx.x * ROW_TILE_SIZE;
+
+    // create our tild distribution, which tell us the location of different threads
+    constexpr auto src_dist = make_static_tile_distribution(
+        tile_distribution_encoding<
+            sequence<1>,
+            tuple<sequence<row_repeat, num_warps, row_lanes>, sequence<col_lanes, vec>>,
+            tuple<sequence<1>, sequence<1, 2>>,
+            tuple<sequence<1>, sequence<2, 0>>,
+            sequence<1, 2>,
+            sequence<0, 1>>{});
+    const auto coord     = src_dist.calculate_index();
+    const auto row_coord = coord[number<0>{}] + tile_row_idx;
+
+    // load the current row index from the indexing buffer. we do not use ck_tile utility here
+    INDEX_BUF_TYPE src_row_id = src_row_idx_ptr[row_coord];
+    INDEX_BUF_TYPE dst_row_id = dst_row_idx_ptr[row_coord];
+
+    // printf("-- tid:%d, src_row_id:%d, dst_row_id:%d\n", static_cast<int>(threadIdx.x),
+    // static_cast<int>(src_row_id), static_cast<int>(dst_row_id));
+
+    const auto src_view =
+        make_naive_tensor_view<address_space_enum::global>(src_ptr,
+                                                           make_tuple(n_row_total, n_cols),
+                                                           make_tuple(n_cols, 1),
+                                                           number<vec>{}, // alignement
+                                                           number<1>{});
+
+    const auto src_gather_view = transform_tensor_view(
+        src_view,
+        make_tuple(make_indexing_transform(
+                       n_row_total,
+                       src_row_id), // here we replace row_idx  which is loaded from another buffer
+                   make_pass_through_transform(n_cols)),
+        make_tuple(sequence<0>{}, sequence<1>{}),
+        make_tuple(sequence<0>{}, sequence<1>{}));
+
+    auto src_tile = make_tile_window(src_gather_view,
+                                     make_tuple(number<ROW_TILE_SIZE>{}, number<COL_TILE_SIZE>{}),
+                                     {tile_row_idx, tile_col_idx},
+                                     src_dist);
+
+    const auto dst_view =
+        make_naive_tensor_view<address_space_enum::global>(dst_ptr,
+                                                           make_tuple(n_row_total, n_cols),
+                                                           make_tuple(n_cols, 1),
+                                                           number<vec>{},
+                                                           number<1>{});
+
+    const auto dst_scatter_view = transform_tensor_view(
+        dst_view,
+        make_tuple(make_indexing_transform(
+                       n_row_total,
+                       dst_row_id), // here we replace row_idx  which is loaded from another buffer
+                   make_pass_through_transform(n_cols)),
+        make_tuple(sequence<0>{}, sequence<1>{}),
+        make_tuple(sequence<0>{}, sequence<1>{}));
+
+    auto dst_tile = make_tile_window(dst_scatter_view,
+                                     make_tuple(number<ROW_TILE_SIZE>{}, number<COL_TILE_SIZE>{}),
+                                     {tile_row_idx, tile_col_idx},
+                                     src_dist /*reuse distribution*/);
+
+    // we finished descriptor construction and index calculation, now start load/store
+    for(auto i = 0; i < n_cols; i += COL_TILE_SIZE)
+    {
+        // note that scatter/gather are just the same API when doing load store as normal memory
+        // operation
+        auto data = load_tile(src_tile);
+        store_tile(dst_tile, data);
+
+        move_tile_window(src_tile, {number<0>{}, number<COL_TILE_SIZE>{}});
+        move_tile_window(dst_tile, {number<0>{}, number<COL_TILE_SIZE>{}});
+    }
+}
+
+union pixel
+{
+    struct __attribute__((packed))
+    {
+        unsigned int r : 6;
+        unsigned int c : 10;
+    };
+    ushort data;
+};
+
+struct unique_linear_rand
+{
+    unique_linear_rand(int capacity_) : capacity(capacity_) {}
+    std::unordered_set<int> set;
+    int gen()
+    {
+        if(static_cast<int>(set.size()) >= capacity)
+        {
+            printf("overflow, but will give you an number as well\n");
+            return std::rand() % capacity;
+        }
+        while(1)
+        {
+            int r = std::rand() % capacity;
+            if(set.count(r) == 1)
+            {
+                continue;
+            }
+            set.insert(r);
+            return r;
+        }
+    }
+
+    int capacity;
+};
+
+int main()
+{
+    int row_total  = 64;
+    int row_select = 8 * 2;
+    int col        = 256 * 2;
+    using fp16_t   = ck_tile::fp16_t;
+
+    constexpr int row_tile = 8;
+    constexpr int col_tile = 256;
+
+    fp16_t* src = reinterpret_cast<fp16_t*>(malloc(row_total * col * sizeof(fp16_t)));
+    for(int i_r = 0; i_r < row_total; i_r++)
+    {
+        for(int i_c = 0; i_c < col; i_c++)
+        {
+            int i = i_r * col + i_c;
+            pixel p;
+            p.r      = i_r;
+            p.c      = i_c;
+            ushort d = p.data;
+            src[i]   = ck_tile::bit_cast<fp16_t>(d); // for simplicity, just cast
+        }
+    }
+
+    fp16_t* dst  = reinterpret_cast<fp16_t*>(malloc(row_total * col * sizeof(fp16_t)));
+    int* src_idx = reinterpret_cast<int*>(malloc(row_select * sizeof(int)));
+    int* dst_idx = reinterpret_cast<int*>(malloc(row_select * sizeof(int)));
+    // std::srand(std::time(std::nullptr));
+    // std::srand(11935);
+    std::srand(std::time(nullptr));
+    auto src_gen = unique_linear_rand(row_total);
+    auto dst_gen = unique_linear_rand(row_total); // dst index must be unique. src is fine
+    for(int i_r = 0; i_r < row_select; i_r++)
+    {
+        src_idx[i_r] = src_gen.gen();
+        dst_idx[i_r] = dst_gen.gen();
+    }
+
+    void* dev_src;
+    void* dev_dst;
+    void* dev_src_idx;
+    void* dev_dst_idx;
+    HIP_CALL(hipMalloc(&dev_src, row_total * col * sizeof(fp16_t)));
+    HIP_CALL(hipMalloc(&dev_dst, row_total * col * sizeof(fp16_t)));
+    HIP_CALL(hipMalloc(&dev_src_idx, row_select * sizeof(int)));
+    HIP_CALL(hipMalloc(&dev_dst_idx, row_select * sizeof(int)));
+
+    HIP_CALL(hipMemcpy(dev_src, src, row_total * col * sizeof(fp16_t), hipMemcpyHostToDevice));
+    HIP_CALL(hipMemcpy(dev_src_idx, src_idx, row_select * sizeof(int), hipMemcpyHostToDevice));
+    HIP_CALL(hipMemcpy(dev_dst_idx, dst_idx, row_select * sizeof(int), hipMemcpyHostToDevice));
+
+    constexpr int bdim = 256;
+    int gdim           = (row_select + row_tile - 1) / row_tile;
+    row_scatter_gather<row_tile, col_tile><<<gdim, bdim>>>(reinterpret_cast<int*>(dev_src_idx),
+                                                           reinterpret_cast<int*>(dev_dst_idx),
+                                                           reinterpret_cast<fp16_t*>(dev_src),
+                                                           reinterpret_cast<fp16_t*>(dev_dst),
+                                                           row_total,
+                                                           row_select,
+                                                           col);
+
+    HIP_CALL(hipMemcpy(dst, dev_dst, row_total * col * sizeof(fp16_t), hipMemcpyDeviceToHost));
+
+#if TEST_SCATTER_GATHER_VERBOSE
+    printf("select row:");
+    for(int i_r = 0; i_r < row_select; i_r++)
+    {
+        printf("%d->%d->%d ", i_r, src_idx[i_r], dst_idx[i_r]);
+    }
+    printf("\n");
+#endif
+
+    int err_cnt = 0;
+    for(int i_r = 0; i_r < row_select; i_r++)
+    {
+        for(int i_c = 0; i_c < col; i_c++)
+        {
+            int i      = dst_idx[i_r] * col + i_c;
+            pixel p    = ck_tile::bit_cast<pixel>(dst[i]);
+            bool is_ok = p.r == src_idx[i_r] && p.c == i_c;
+            if(!is_ok)
+            {
+                if(i_c == 0)
+                    printf("(%d)pixel: %dx%d -> %d\n", i_r, p.r, p.c, dst_idx[i_r]);
+                err_cnt++;
+            }
+        }
+    }
+#if TEST_SCATTER_GATHER_VERBOSE
+    printf("err:%d\n", err_cnt);
+#endif
+
+    free(src);
+    free(dst);
+    free(src_idx);
+    free(dst_idx);
+    return err_cnt == 0 ? 0 : -1;
+}
-- 
GitLab


From 863222181477ff42e809d034428f9160490a63ba Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Wed, 30 Oct 2024 14:03:16 +0800
Subject: [PATCH 024/153] [CK_TILE] Add fmha fwd headdim96 support (#1608)

* Add ceil_to_qualified_tile_length()

* Rename kK0BlockLength to kQKHeaddim

* Add kSubQKHeaddim concept to support headdim96

* Fix in math.hpp to avoid using __half interfaces

* Add LdsBufferSequence instance for headdim96

* Update in fmha_fwd/fmha_fwd_splitkv codegen to support hd96 testing

* Disable hd96 instance generation in codegen fmha_fwd and fmha_fwd_splitkv to save compiling time

* Reformat one file

* Fix text alignment in fmha_fwd_splitkv.py

---------

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   | 41 +++++++++++-------
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   | 42 ++++++++++++-------
 include/ck_tile/core/numeric/math.hpp         | 12 +++---
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |  8 ++--
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |  8 ++--
 ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 23 +++++-----
 .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp | 23 +++++-----
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    | 23 +++++-----
 .../block_fmha_pipeline_qr_ks_vs_fp8.hpp      | 22 +++++-----
 .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp | 23 +++++-----
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 15 ++++---
 .../ops/fmha/pipeline/tile_fmha_shape.hpp     | 20 ++++++++-
 12 files changed, 153 insertions(+), 107 deletions(-)

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 805803fed..e5ee1d22e 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -21,6 +21,14 @@ DTYPE_BITS = {
     "bf8" : 8
 }
 
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
+
 TILE_PARTITIONER_MAP = {
     "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
     "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
@@ -35,7 +43,7 @@ FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 FMHA_FWD_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
 
-using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
@@ -87,7 +95,7 @@ using fmha_kernel_{F_idx} =
                   fmha_pipeline_{F_idx},
                   fmha_epilogue_{F_idx}>;
 
-using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                         {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
 
 #include <iostream>
@@ -125,7 +133,7 @@ FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 
 FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
                 return fmha_fwd_<trait_>(s, a);
             }}
 """
@@ -142,7 +150,7 @@ class FmhaFwdApiTrait:
     bk0       : int  # tile size along qk gemm unroll
     bn1       : int  # tile size along v head_dim
     bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
     vlayout   : str
     mask      : str
     bias      : str  #
@@ -156,7 +164,7 @@ class FmhaFwdApiTrait:
 
     @property
     def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
                     f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'
 
     @property
@@ -188,8 +196,9 @@ class FmhaFwdApiTrait:
             if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
             else :               assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :               return f'a.hdim_q % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
         else:   assert False
 
     @property
@@ -199,8 +208,9 @@ class FmhaFwdApiTrait:
             if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
             else :                assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.hdim_v % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
         else:   assert False
 
 @dataclass
@@ -271,7 +281,7 @@ class FmhaFwdApiPool:
                                    F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] ,
                                    F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                    F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                    F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -289,7 +299,7 @@ class FmhaFwdTileSize:
     F_bk0       : int  # tile size along qk gemm unroll
     F_bn1       : int  # tile size along v head_dim
     F_bk1       : int  # tile size along kv gemm unroll
-    F_bk0blen   : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
     F_rm0       : int  # number of warps for gemm0 along q seqlen
     F_rn0       : int  # number of warps for gemm0 along k seqlen 
     F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
@@ -302,7 +312,7 @@ class FmhaFwdTileSize:
     F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
     @property
     def name(self) -> str:
-        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\
+        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
         f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
         f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 
@@ -335,7 +345,7 @@ class FmhaFwdKernel:
                 F_bk0           = self.F_tile.F_bk0,
                 F_bn1           = self.F_tile.F_bn1,
                 F_bk1           = self.F_tile.F_bk1,
-                F_bk0blen       = self.F_tile.F_bk0blen,
+                F_bk0max        = self.F_tile.F_bk0max,
                 F_rm0           = self.F_tile.F_rm0,
                 F_rn0           = self.F_tile.F_rn0,
                 F_rk0           = self.F_tile.F_rk0,
@@ -382,7 +392,7 @@ class FmhaFwdKernel:
                 bk0=self.F_tile.F_bk0,
                 bn1=self.F_tile.F_bn1,
                 bk1=self.F_tile.F_bk1,
-                bk0blen=self.F_tile.F_bk0blen,
+                bk0max=self.F_tile.F_bk0max,
                 vlayout=self.F_pipeline.F_vlayout,
                 mask=self.F_pipeline.F_mask,
                 bias=self.F_pipeline.F_bias,
@@ -401,6 +411,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
         return {
             '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1,  2, 1, 1,  32, 32, 16, -1),
             '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            ## '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32, 96,   4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
             '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
             '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
         }
@@ -510,4 +521,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
         _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
\ No newline at end of file
+        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 46c26b22c..b084e9d0f 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -29,6 +29,14 @@ DTYPE_BITS = {
     "bf8" : 8
 }
 
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
+
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
     "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
     "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
@@ -41,7 +49,7 @@ using fmha_mask_{F_idx} = {F_mask};
 namespace {{
 template <bool kHasUnevenSplits>
 struct kernel_runner {{
-using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
@@ -103,7 +111,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }};
 }}
 
-using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                         {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, 
                         {F_dvpad}>;
 
@@ -241,7 +249,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
                 using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
 
                 return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
@@ -260,7 +268,7 @@ class FmhaFwdSplitKVApiTrait:
     bk0       : int  # tile size along qk gemm unroll
     bn1       : int  # tile size along v head_dim
     bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
     vlayout   : str
     mask      : str
     bias      : str  #
@@ -270,11 +278,11 @@ class FmhaFwdSplitKVApiTrait:
     skpad     : str
     dpad      : str
     dvpad     : str
-    pagedkv : str
+    pagedkv   : str
 
     @property
     def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-'+\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
                     f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\
                     f'{self.dvpad}-{self.pagedkv}'
 
@@ -307,8 +315,9 @@ class FmhaFwdSplitKVApiTrait:
             if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
             else :               assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't': return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :               return f'a.hdim_q % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
         else:   assert False
 
     @property
@@ -318,8 +327,9 @@ class FmhaFwdSplitKVApiTrait:
             if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
             else :                assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.hdim_v % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
         else:   assert False
 
 @dataclass
@@ -414,7 +424,7 @@ class FmhaFwdSplitKVApiPool:
                                    F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], 
                                    F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                    F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                    F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -458,7 +468,7 @@ class FmhaFwdSplitKVKernel:
                 F_bk0           = self.F_tile.F_bk0,
                 F_bn1           = self.F_tile.F_bn1,
                 F_bk1           = self.F_tile.F_bk1,
-                F_bk0blen       = self.F_tile.F_bk0blen,
+                F_bk0max        = self.F_tile.F_bk0max,
                 F_rm0           = self.F_tile.F_rm0,
                 F_rn0           = self.F_tile.F_rn0,
                 F_rk0           = self.F_tile.F_rk0,
@@ -504,7 +514,7 @@ class FmhaFwdSplitKVKernel:
                 bk0=self.F_tile.F_bk0,
                 bn1=self.F_tile.F_bn1,
                 bk1=self.F_tile.F_bk1,
-                bk0blen=self.F_tile.F_bk0blen,
+                bk0max=self.F_tile.F_bk0max,
                 vlayout=self.F_pipeline.F_vlayout,
                 mask=self.F_pipeline.F_mask,
                 bias=self.F_pipeline.F_bias,
@@ -559,6 +569,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
         return {
             '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16, -1),
             '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            ## '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
             '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
             '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
         }
@@ -576,6 +587,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
         return {
             '32'  : FmhaFwdSplitKVCombineTileSize(16, 16,  -1),
             '64'  : FmhaFwdSplitKVCombineTileSize(32, 32,  -1),
+            ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
             '128' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
             '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
     }
@@ -604,7 +616,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         if dtype in ['fp16', 'bf16']:
             for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
                 # TODO: use async pipeline when compiler is more stable 
-                if hdim == 256 or hdim in [32, 64, 128]:
+                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
                 # if True:
                     pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
                     pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
@@ -743,4 +755,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
         _, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
\ No newline at end of file
+        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp
index 785691b66..0faf1aa04 100644
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
@@ -1126,7 +1126,7 @@ CK_TILE_DEVICE int8_t neg<int8_t>(int8_t x)
 template <>
 CK_TILE_DEVICE fp16_t neg<fp16_t>(fp16_t x)
 {
-    return __hneg(x);
+    return -x;
 };
 
 template <typename T>
@@ -1168,7 +1168,7 @@ CK_TILE_DEVICE double sin<double>(double x)
 template <>
 CK_TILE_DEVICE fp16_t sin<fp16_t>(fp16_t x)
 {
-    return ::hsin(x);
+    return __ocml_sin_f16(x);
 };
 
 template <typename T>
@@ -1300,7 +1300,7 @@ CK_TILE_DEVICE double ceil<double>(double x)
 template <>
 CK_TILE_DEVICE fp16_t ceil<fp16_t>(fp16_t x)
 {
-    return ::hceil(x);
+    return __ocml_ceil_f16(x);
 };
 
 template <typename T>
@@ -1342,7 +1342,7 @@ CK_TILE_DEVICE double floor<double>(double x)
 template <>
 CK_TILE_DEVICE fp16_t floor<fp16_t>(fp16_t x)
 {
-    return ::hfloor(x);
+    return __ocml_floor_f16(x);
 };
 
 template <typename T>
@@ -1365,7 +1365,7 @@ CK_TILE_DEVICE T exp(T x)
 template <>
 CK_TILE_DEVICE fp16_t exp<fp16_t>(fp16_t x)
 {
-    return hexp(x);
+    return __ocml_exp_f16(x);
 };
 
 template <>
@@ -1389,7 +1389,7 @@ CK_TILE_DEVICE T log(T x)
 template <>
 CK_TILE_DEVICE fp16_t log<fp16_t>(fp16_t x)
 {
-    return hlog(x);
+    return __ocml_log_f16(x);
 };
 
 template <>
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 8c1f6c805..e0c145fde 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -82,10 +82,10 @@ struct FmhaFwdKernel
             if (kPadHeadDimV) n += "dv";
             return n.empty() ? n : std::string("p") + n; }();
         return
-            _SS_("fmha_fwd_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s<QDataType>::name) +
+            _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
-                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
+                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
             "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
             "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
@@ -657,7 +657,7 @@ struct FmhaFwdKernel
             {
                 return pad_tensor_view(
                     q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0BlockLength>{}),
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
                     sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
             else
@@ -724,7 +724,7 @@ struct FmhaFwdKernel
             [&]() {
                 if constexpr(FmhaPipeline::kQLoadOnce)
                     return make_tuple(number<FmhaPipeline::kM0>{},
-                                      number<FmhaPipeline::kK0BlockLength>{});
+                                      number<FmhaPipeline::kSubQKHeaddim>{});
                 else
                     return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
             }(),
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index ea30025b5..4ffebc3c9 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -78,10 +78,10 @@ struct FmhaFwdSplitKVKernel
             if (kPadHeadDimV) n += "dv";
             return n.empty() ? n : std::string("p") + n; }();
         return
-            _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s<QDataType>::name) +
+            _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
-                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
+                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
             "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
             "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
@@ -586,7 +586,7 @@ struct FmhaFwdSplitKVKernel
             {
                 return pad_tensor_view(
                     q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0BlockLength>{}),
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
                     sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
             else
@@ -735,7 +735,7 @@ struct FmhaFwdSplitKVKernel
             [&]() {
                 if constexpr(FmhaPipeline::kQLoadOnce)
                     return make_tuple(number<FmhaPipeline::kM0>{},
-                                      number<FmhaPipeline::kK0BlockLength>{});
+                                      number<FmhaPipeline::kSubQKHeaddim>{});
                 else
                     return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
             }(),
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 6e7416ce8..71c3bd171 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -34,12 +34,13 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
@@ -75,22 +76,22 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             return Problem::kBlockPerCu;
         else
         {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
             {
                 return 2;
             }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
             {
                 return 3;
             }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
             {
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 1;
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
@@ -270,7 +271,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
 
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         static_assert(2 <= k0_loops);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
index 6837ffdee..a7e928714 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -37,12 +37,13 @@ struct BlockFmhaPipelineQRKSVS
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -76,22 +77,22 @@ struct BlockFmhaPipelineQRKSVS
             return Problem::kBlockPerCu;
         else
         {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
             {
                 return 2;
             }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
             {
                 return 3;
             }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
             {
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 1;
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
@@ -261,7 +262,7 @@ struct BlockFmhaPipelineQRKSVS
 
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         static_assert(2 <= k0_loops);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index 05d3dae1c..10bb01168 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -38,12 +38,13 @@ struct BlockFmhaPipelineQRKSVSAsync
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x)
@@ -87,7 +88,7 @@ struct BlockFmhaPipelineQRKSVSAsync
                 return 1;
             }
 
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
             {
                 if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS &&
                              FmhaMask::IsMasking)
@@ -95,21 +96,21 @@ struct BlockFmhaPipelineQRKSVSAsync
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
             {
                 if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 2;
                 else
                     return 3;
             }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
             {
                 if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 1;
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
@@ -339,7 +340,7 @@ struct BlockFmhaPipelineQRKSVSAsync
         // auto q_tile = q;      // tile_elementwise_in(q_element_func, q);
 
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         static_assert(1 <= k0_loops);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
index f4767de0e..a1b1e0e15 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
@@ -36,12 +36,12 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0        = BlockFmhaShape::kM0;
+    static constexpr index_t kN0        = BlockFmhaShape::kN0;
+    static constexpr index_t kK0        = BlockFmhaShape::kK0;
+    static constexpr index_t kN1        = BlockFmhaShape::kN1;
+    static constexpr index_t kK1        = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;
 
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -75,22 +75,22 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
             return Problem::kBlockPerCu;
         else
         {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
             {
                 return 2;
             }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
             {
                 return 3;
             }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
             {
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 1;
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
@@ -232,7 +232,7 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
 
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         static_assert(2 <= k0_loops);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
index d08a8d489..b98247df9 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
@@ -36,12 +36,13 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
 
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -56,22 +57,22 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
             return Problem::kBlockPerCu;
         else
         {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
             {
                 return 2;
             }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
             {
                 return 3;
             }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
             {
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                     return 1;
                 else
                     return 2;
             }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
@@ -235,7 +236,7 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
 
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         static_assert(2 <= k0_loops);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 807ad6548..fbb05e164 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -55,7 +55,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
         constexpr index_t MWarp = config.template at<1>();
 
         constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
 
         constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
         constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane;
@@ -323,6 +323,9 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     template<> struct
     LdsBufferSequence<3, 3, 3, 3> { using type = sequence<1, 2, 0,      1, 2, 0>; };
 
+    template<> struct
+    LdsBufferSequence<3, 3, 3, 4> { using type = sequence<1, 2, 0,      0, 1, 2, 0>; };
+
     template<> struct
     LdsBufferSequence<3, 3, 2, 2> { using type = sequence<1, 2,         1, 0>;};
     // clang-format on
@@ -332,12 +335,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     {
         using BlockFmhaShape = remove_cvref_t<typename Problem::BlockFmhaShape>;
 
-        constexpr index_t kN0            = BlockFmhaShape::kN0;
-        constexpr index_t kK0            = BlockFmhaShape::kK0;
-        constexpr index_t kK1            = BlockFmhaShape::kK1;
-        constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+        constexpr index_t kN0        = BlockFmhaShape::kN0;
+        constexpr index_t kK0        = BlockFmhaShape::kK0;
+        constexpr index_t kK1        = BlockFmhaShape::kK1;
+        constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;
 
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
 
         return typename LdsBufferSequence<NumPrefetchK, NumPrefetchV, k0_loops, k1_loops>::type{};
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index f2bb2200f..570754b22 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -7,6 +7,20 @@
 
 namespace ck_tile {
 
+static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index_t len)
+{
+    if(len == 96)
+        return 128;
+    if(len == 160)
+        return 256;
+
+    // only length of 96, 160 and power-of-two is supported
+    if(!(len & (len - 1)))
+        return len;
+
+    return 0;
+};
+
 template <typename BlockTile_, // sequence<...
           typename Gemm0BlockWarps_,
           typename Gemm0WarpTile_,
@@ -36,10 +50,12 @@ struct TileFmhaShape
     static constexpr index_t kK0 = BlockTile::at(number<2>{}); // tile size along qk gemm unroll
     static constexpr index_t kN1 = BlockTile::at(number<3>{}); // tile size along v head_dim
     static constexpr index_t kK1 = BlockTile::at(number<4>{}); // tile size along kv gemm unroll
-    static constexpr index_t kK0BlockLength =
+    static constexpr index_t kQKHeaddim =
         BlockTile::at(number<5>{}); // total length of K0, used for pipeline that need load Q at
                                     // once (or repeately load Q as a whole tile)
-    static_assert(kK0BlockLength % kK0 == 0, "kK0BlockLength should be divisible by kK0");
+    static_assert(kQKHeaddim % kK0 == 0, "kQKHeaddim should be divisible by kK0");
+
+    static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length(kQKHeaddim);
 
     // v, rowmajor : seqlen*hdim, colmajor : hdim*seqlen
     static constexpr bool IsVLayoutRowMajor = IsVLayoutRowMajor_;
-- 
GitLab


From 3d60953477bd575e320c84240a9f8ef49eb7bedd Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Wed, 30 Oct 2024 15:22:56 +0800
Subject: [PATCH 025/153] [Ck tile] support rmsnorm and related fusion (#1605)

* Add reduce2d new api

* Prevent user use cross warp reduction

* Fix bug of std caculation

* Add rmsnorm2d

* Add rmsnorm small example

* Remove static assert to prevent compile fail

* Add script to test performance and correctness

* Add missing cmake change

* refine naming

* refine example of rmsnorm

* Fix bug of rmsnorm

* Refine naming

* Fix cmake

* clang format

* Refine pipeline name

* Add add_rmsnorm2d_rdquant kernel

* Add reduce op

* host verification

* Fix bug of one pass pipeline

* Refine tile size

* Add two pass pipeline

* Rename two pass to three pass

* Fix bug of kSaveX == false

* Add instance library

* Add test script

* Fix bug of x verification

* Add save_x to trait

* Add README

* Move reduce2d into reduce folder

* Fix bug of welford when number of m warp > 1

* remove reduncant comment

* 1. move 06_rmsnorm2d to 10_rmsnorm2d
2. move 07_add_rmsnorm2d_rdquant to 11_add_rmsnorm2d_rdquant

* clang format and add missing header

* Add host validation of add + layernorm2d + rsquant

* Revert "Add host validation of add + layernorm2d + rsquant"

This reverts commit 936cb457978b928b90eff89a08fcdb7dc8bbed67.

* Remove deprecated flag
---
 example/ck_tile/05_reduce/reduce.cpp          |  65 ++--
 example/ck_tile/05_reduce/reduce.hpp          | 172 +++++++----
 example/ck_tile/10_rmsnorm2d/CMakeLists.txt   |  25 ++
 example/ck_tile/10_rmsnorm2d/README.md        |  22 ++
 .../10_rmsnorm2d/example_rmsnorm2d_fwd.cpp    | 165 +++++++++++
 .../instances/rmsnorm2d_fwd_api.cpp           | 153 ++++++++++
 .../rmsnorm2d_fwd_bf16_n1024_instance.cpp     |  22 ++
 .../rmsnorm2d_fwd_bf16_n1536_instance.cpp     |  13 +
 .../rmsnorm2d_fwd_bf16_n2048_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_bf16_n256_instance.cpp      |  12 +
 .../rmsnorm2d_fwd_bf16_n3072_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_bf16_n4096_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp  |  14 +
 .../rmsnorm2d_fwd_bf16_n512_instance.cpp      |  13 +
 .../rmsnorm2d_fwd_bf16_n64_n128_instance.cpp  |  12 +
 .../rmsnorm2d_fwd_bf16_n768_instance.cpp      |  12 +
 .../rmsnorm2d_fwd_fp16_n1024_instance.cpp     |  22 ++
 .../rmsnorm2d_fwd_fp16_n1536_instance.cpp     |  13 +
 .../rmsnorm2d_fwd_fp16_n2048_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_fp16_n256_instance.cpp      |  12 +
 .../rmsnorm2d_fwd_fp16_n3072_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_fp16_n4096_instance.cpp     |  14 +
 .../rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp  |  14 +
 .../rmsnorm2d_fwd_fp16_n512_instance.cpp      |  13 +
 .../rmsnorm2d_fwd_fp16_n64_n128_instance.cpp  |  12 +
 .../rmsnorm2d_fwd_fp16_n768_instance.cpp      |  12 +
 .../rmsnorm2d_fwd_instance_common.hpp         |  65 ++++
 .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp    | 179 +++++++++++
 .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp    | 117 ++++++++
 .../ck_tile/10_rmsnorm2d/script/perf_test.sh  |  38 +++
 .../ck_tile/10_rmsnorm2d/script/smoke_test.sh |  31 ++
 .../11_add_rmsnorm2d_rdquant/CMakeLists.txt   |  25 ++
 .../11_add_rmsnorm2d_rdquant/README.md        |  22 ++
 .../add_rmsnorm2d_rdquant_fwd.cpp             | 279 +++++++++++++++++
 .../add_rmsnorm2d_rdquant_fwd.hpp             | 123 ++++++++
 .../example_add_rmsnorm2d_rdquant_fwd.cpp     | 280 ++++++++++++++++++
 .../add_rmsnorm2d_rdquant_fwd_api.cpp         | 157 ++++++++++
 ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp |  22 ++
 ...norm2d_rdquant_fwd_bf16_n1536_instance.cpp |  13 +
 ...norm2d_rdquant_fwd_bf16_n2048_instance.cpp |  14 +
 ...snorm2d_rdquant_fwd_bf16_n256_instance.cpp |  12 +
 ...norm2d_rdquant_fwd_bf16_n3072_instance.cpp |  14 +
 ...norm2d_rdquant_fwd_bf16_n4096_instance.cpp |  14 +
 ...m2d_rdquant_fwd_bf16_n4096_tp_instance.cpp |  14 +
 ...snorm2d_rdquant_fwd_bf16_n512_instance.cpp |  13 +
 ...m2d_rdquant_fwd_bf16_n64_n128_instance.cpp |  12 +
 ...snorm2d_rdquant_fwd_bf16_n768_instance.cpp |  12 +
 ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp |  22 ++
 ...norm2d_rdquant_fwd_fp16_n1536_instance.cpp |  13 +
 ...norm2d_rdquant_fwd_fp16_n2048_instance.cpp |  14 +
 ...snorm2d_rdquant_fwd_fp16_n256_instance.cpp |  12 +
 ...norm2d_rdquant_fwd_fp16_n3072_instance.cpp |  14 +
 ...norm2d_rdquant_fwd_fp16_n4096_instance.cpp |  14 +
 ...m2d_rdquant_fwd_fp16_n4096_tp_instance.cpp |  14 +
 ...snorm2d_rdquant_fwd_fp16_n512_instance.cpp |  13 +
 ...m2d_rdquant_fwd_fp16_n64_n128_instance.cpp |  12 +
 ...snorm2d_rdquant_fwd_fp16_n768_instance.cpp |  12 +
 ..._rmsnorm2d_rdquant_fwd_instance_common.hpp |  67 +++++
 .../script/perf_test.sh                       |  38 +++
 .../script/smoke_test.sh                      |  31 ++
 example/ck_tile/CMakeLists.txt                |   3 +-
 include/ck_tile/core.hpp                      |   1 +
 .../ck_tile/core/utility/reduce_operator.hpp  |  95 ++++++
 include/ck_tile/host.hpp                      |   3 +
 .../host/reference/reference_elementwise.hpp  |  47 +++
 .../host/reference/reference_reduce.hpp       |  17 +-
 .../reference/reference_rmsnorm2d_fwd.hpp     |  52 ++++
 .../reference_rowwise_quantization2d.hpp      |  33 +++
 include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp |  12 +
 .../add_rmsnorm2d_rdquant_fwd_kernel.hpp      | 239 +++++++++++++++
 .../add_rmsnorm2d_rdquant_fwd_shape.hpp       |  78 +++++
 ...2d_rdquant_fwd_pipeline_default_policy.hpp |  94 ++++++
 ...msnorm2d_rdquant_fwd_pipeline_one_pass.hpp | 142 +++++++++
 ...rmsnorm2d_rdquant_fwd_pipeline_problem.hpp |  41 +++
 ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 266 +++++++++++++++++
 .../layernorm2d_fwd_pipeline_one_pass.hpp     |   4 +-
 .../layernorm2d_fwd_pipeline_two_pass.hpp     |   6 +-
 include/ck_tile/ops/reduce.hpp                |   3 +
 .../ck_tile/ops/reduce/block/block_reduce.hpp |  19 +-
 .../ops/reduce/block/block_reduce2d.hpp       | 260 ++++++++++++++++
 .../block/block_reduce2d_default_policy.hpp   |  79 +++++
 .../reduce/block/block_reduce2d_problem.hpp   |  18 ++
 include/ck_tile/ops/rmsnorm2d.hpp             |  12 +
 .../rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp | 202 +++++++++++++
 .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp  |  78 +++++
 .../rmsnorm2d_fwd_pipeline_default_policy.hpp |  94 ++++++
 .../rmsnorm2d_fwd_pipeline_one_pass.hpp       | 101 +++++++
 .../rmsnorm2d_fwd_pipeline_problem.hpp        |  36 +++
 .../rmsnorm2d_fwd_pipeline_two_pass.hpp       | 131 ++++++++
 .../ops/welford/block/block_welford.hpp       |   8 +-
 90 files changed, 4667 insertions(+), 121 deletions(-)
 create mode 100644 example/ck_tile/10_rmsnorm2d/CMakeLists.txt
 create mode 100644 example/ck_tile/10_rmsnorm2d/README.md
 create mode 100644 example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
 create mode 100644 example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
 create mode 100755 example/ck_tile/10_rmsnorm2d/script/perf_test.sh
 create mode 100755 example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp
 create mode 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
 create mode 100755 example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
 create mode 100755 example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
 create mode 100644 include/ck_tile/core/utility/reduce_operator.hpp
 create mode 100644 include/ck_tile/host/reference/reference_elementwise.hpp
 create mode 100644 include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
 create mode 100644 include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
 create mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
 create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d.hpp
 create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp
 create mode 100644 include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
 create mode 100644 include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp

diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
index 7973a8dfd..005541dc6 100644
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -19,9 +19,9 @@ auto create_args(int argc, char* argv[])
 template <typename DataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    using ADataType   = DataType;
-    using AccDataType = float;
-    using BDataType   = DataType;
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
 
     ck_tile::index_t m = arg_parser.get_int("m");
     ck_tile::index_t n = arg_parser.get_int("n");
@@ -29,35 +29,39 @@ bool run(const ck_tile::ArgParser& arg_parser)
     int warmup         = arg_parser.get_int("warmup");
     int repeat         = arg_parser.get_int("repeat");
 
-    ck_tile::HostTensor<ADataType> a_host({m, n});
-    ck_tile::HostTensor<BDataType> b_host_ref({m});
-    ck_tile::HostTensor<BDataType> b_host_dev({m});
+    ck_tile::HostTensor<XDataType> x_host({m, n});
+    ck_tile::HostTensor<YDataType> y_host_ref({m});
+    ck_tile::HostTensor<YDataType> y_host_dev({m});
 
-    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
 
-    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
 
-    a_buf.ToDevice(a_host.data());
+    x_buf.ToDevice(x_host.data());
 
+    using ReduceOp   = ck_tile::ReduceOp::Add;
     using BlockWarps = ck_tile::sequence<4, 1>;
     using BlockTile  = ck_tile::sequence<128, 128>;
     using WarpTile   = ck_tile::sequence<32, 128>;
-    using ThreadTile = ck_tile::sequence<8, 8>;
+    using Vector     = ck_tile::sequence<8, 8>;
 
-    constexpr ck_tile::index_t kBlockSize  = 256;
+    // cross warp-reduce
+    // using BlockWarps = ck_tile::sequence<2, 2>;
+    // using BlockTile  = ck_tile::sequence<2, 1024>;
+    // using WarpTile   = ck_tile::sequence<1, 512>;
+    // using Vector = ck_tile::sequence<1, 8>;
+
+    constexpr ck_tile::index_t kBlockSize  = 512;
     constexpr ck_tile::index_t kBlockPerCu = 1;
     ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
     std::cout << "grid size " << kGridSize << std::endl;
 
-    using Kernel = ck_tile::Reduce<ADataType,
-                                   AccDataType,
-                                   BDataType,
-                                   kBlockSize,
-                                   BlockWarps,
-                                   BlockTile,
-                                   WarpTile,
-                                   ThreadTile>;
+    using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
+    using Porblem =
+        ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
+
+    using Kernel = ck_tile::Reduce<Porblem>;
 
     float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                    ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
@@ -65,12 +69,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                        kGridSize,
                                        kBlockSize,
                                        0,
-                                       static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
-                                       static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                                        m,
                                        n));
 
-    std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m;
+    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
 
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
@@ -81,9 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         // reference
-        ck_tile::reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
-        b_buf.FromDevice(b_host_dev.mData.data());
-        pass = ck_tile::check_err(b_host_dev, b_host_ref);
+        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
+            x_host, y_host_ref, ReduceOp{});
+        y_buf.FromDevice(y_host_dev.mData.data());
+        pass = ck_tile::check_err(y_host_dev, y_host_ref);
 
         std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
@@ -103,8 +108,8 @@ int main(int argc, char* argv[])
     {
         return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
     }
-    if(data_type == "bf16")
-    {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
-    }
+    // else if(data_type == "bf16")
+    // {
+    //     return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    // }
 }
diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp
index e36b46895..55e479591 100644
--- a/example/ck_tile/05_reduce/reduce.hpp
+++ b/example/ck_tile/05_reduce/reduce.hpp
@@ -5,20 +5,16 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
-
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
 
 namespace ck_tile {
 
-template <typename ADataType,
-          typename AccDataType,
-          typename BDataType,
-          index_t kBlockSize,
-          typename BlockWarps, // num warps along seq<M, N>
+template <typename BlockWarps, // num warps along seq<M, N>
           typename BlockTile,  // block size, seq<M, N>
           typename WarpTile,   // warp size, seq<M, N>
-          typename ThreadTile> // contiguous pixels(vector size) along seq<M, N>
-struct Reduce
+          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
+struct Reduce2dShape
 {
     static constexpr index_t Block_M = BlockTile::at(number<0>{});
     static constexpr index_t Block_N = BlockTile::at(number<1>{});
@@ -26,93 +22,143 @@ struct Reduce
     static constexpr index_t Warp_M = WarpTile::at(number<0>{});
     static constexpr index_t Warp_N = WarpTile::at(number<1>{});
 
-    static constexpr index_t Thread_M = ThreadTile::at(number<0>{});
-    static constexpr index_t Thread_N = ThreadTile::at(number<1>{});
+    static constexpr index_t Vector_M = Vector::at(number<0>{});
+    static constexpr index_t Vector_N = Vector::at(number<1>{});
 
     static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
     static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
 
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Thread_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Thread_N;
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
 
     static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
     static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
 
-    __device__ static constexpr auto MakeABlockTileDistribution()
-    {
-        return make_static_tile_distribution(
-            tile_distribution_encoding<
-                sequence<>,
-                tuple<sequence<Repeat_M, WarpPerBlock_M, ThreadPerWarp_M, Thread_M>,
-                      sequence<Repeat_N, WarpPerBlock_N, ThreadPerWarp_N, Thread_N>>,
-                tuple<sequence<1, 2>, sequence<1, 2>>,
-                tuple<sequence<1, 1>, sequence<2, 2>>,
-                sequence<1, 1, 2, 2>,
-                sequence<0, 3, 0, 3>>{});
-    }
+    static constexpr index_t BlockSize =
+        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename BlockShape_,
+          typename ReduceOp_>
+struct Reduce2dProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using ReduceOp        = ReduceOp_;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+};
+
+template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
+struct Reduce
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
 
-    __device__ void operator()(const ADataType* p_a, BDataType* p_b, index_t M, index_t N) const
+#if 0
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
+    const
     {
-        const auto a_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_a, make_tuple(M, N), make_tuple(N, 1), number<Thread_N>{}, number<1>{});
+        using S = typename Problem::BlockShape;
 
-        const auto iM = get_block_id() * Block_M;
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
 
-        // A window
-        auto a_block_window = make_tile_window(a_m_n,
-                                               make_tuple(number<Block_M>{}, number<Block_N>{}),
-                                               {iM, 0},
-                                               MakeABlockTileDistribution());
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{});
+
+        const auto iM = get_block_id() * S::Block_M;
+
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
 
         const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
 
-        const ADataType reduce_init_value = 0;
+        const XDataType reduce_init_value = 0;
 
         constexpr auto reduce_dims = sequence<1>{};
 
-        // Acc tile
-        // TODO: support cross warp reduction
-        auto acc_block_tensor = decltype(block_tile_reduce<AccDataType>(
-            load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){};
+        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
+            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
 
-        // init Acc tile
-        tile_elementwise_inout(
-            [&](auto& acc) { acc = type_convert<AccDataType>(reduce_init_value); },
-            acc_block_tensor);
+        set_tile(y_compute, reduce_init_value);
 
-        // loop
-        index_t iN = 0;
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
 
-        do
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            const auto a_block_tensor = load_tile(a_block_window);
+            const auto x = load_tile(x_window);
+            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
 
-            // FIXME: support cross warp reduction
-            block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce);
+        block_tile_reduce_sync(y_compute, f_reduce);
+
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
+    }
+#else
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
+    {
+        using S = typename Problem::BlockShape;
 
-            move_tile_window(a_block_window, {0, Block_N});
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
 
-            iN += Block_N;
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{});
 
-        } while(iN < N);
+        const auto iM = get_block_id() * S::Block_M;
 
-        // FIXME: support cross warp reduction
-        block_tile_reduce_sync(acc_block_tensor, f_reduce);
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
 
-        // convert acc_block_tensor to b_block_tensor
-        const auto b_block_tensor = tile_elementwise_in(
-            [](const auto& acc) { return type_convert<BDataType>(acc); }, acc_block_tensor);
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
 
-        // B
-        const auto b_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_b, make_tuple(M), number<32>{});
+        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
+
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+
+        auto reduce_func         = typename Problem::ReduceOp{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(load_tile(x_window));
+        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_reduce2d(x, y_compute, reduce_func);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
 
-        // B window
-        auto b_block_window = make_tile_window(b_m, make_tuple(number<Block_M>{}), {iM});
+        block_reduce2d_sync(y_compute, reduce_func);
+        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
 
-        // store B tile
-        store_tile(b_block_window, b_block_tensor);
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
     }
+#endif
 };
 
 } // namespace ck_tile
diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
new file mode 100644
index 000000000..a3ff8fdf4
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding ${TILE_RMSNORM2D_FWD}")
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
+target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${INSTANCE_SRCS})
+
+set(TILE_RMSNORM2D_FWD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
+
+set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
+add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp)
+target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
diff --git a/example/ck_tile/10_rmsnorm2d/README.md b/example/ck_tile/10_rmsnorm2d/README.md
new file mode 100644
index 000000000..c06749647
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/README.md
@@ -0,0 +1,22 @@
+# Rmsnorm2D forward
+
+This folder contains example for Rmsnorm2D forward using ck_tile tile-programming implementation.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_rmsnorm2d_fwd -j
+```
+This will result in an executable `build/bin/tile_rmsnorm2d_fwd`
+
+## cmdline
+```
+args:
+          -m    m dimension (default:3328)
+          -n    m dimension (default:4096)
+          -e    epsilon (default:1e-5)
+          -v    cpu validation or not (default:1)
+       -prec    precision (default:fp16)
+```
diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
new file mode 100644
index 000000000..bb2c94901
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -0,0 +1,165 @@
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/rmsnorm2d.hpp"
+#include <cstring>
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    float epsilon         = arg_parser.get_float("e");
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using XDataType      = DataType;
+    using YDataType      = DataType;
+    using GammaDataType  = DataType;
+    using InvRmsDataType = ck_tile::null_type;
+
+    using ComputeDataType = float;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
+
+    ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+
+    constexpr bool kTwoPass = true;
+
+    using BlockWarps = ck_tile::sequence<2, 2>;
+    using BlockTile  = ck_tile::sequence<2, 128>;
+    using WarpTile   = ck_tile::sequence<1, 64>;
+    using Vector     = ck_tile::sequence<1, 1>;
+
+    using Shape   = ck_tile::Rmsnorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Problem = ck_tile::Rmsnorm2dFwdPipelineProblem<XDataType,
+                                                         GammaDataType,
+                                                         ComputeDataType,
+                                                         YDataType,
+                                                         InvRmsDataType,
+                                                         Shape,
+                                                         true,  // kPadN
+                                                         false, // kSaveInvRms
+                                                         kTwoPass>;
+
+    using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass<Problem>;
+    using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass<Problem>;
+    using Pipeline        = std::conditional_t<kTwoPass, TwoPassPipeline, OnePassPipeline>;
+    using Kernel          = ck_tile::Rmsnorm2dFwd<Pipeline>;
+
+    ck_tile::Rmsnorm2dFwdHostArgs args{x_buf.GetDeviceBuffer(),
+                                       gamma_buf.GetDeviceBuffer(),
+                                       y_buf.GetDeviceBuffer(),
+                                       nullptr,
+                                       epsilon,
+                                       m,
+                                       n,
+                                       stride};
+
+    auto kargs = Kernel::MakeKargs(args);
+
+    const dim3 grids                       = Kernel::GridSize(args);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat};
+
+    ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                         GammaDataType,
+                                         ComputeDataType,
+                                         YDataType,
+                                         InvRmsDataType>(
+            x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon);
+
+        y_buf.FromDevice(y_host_dev.data());
+
+        auto [rtol, atol] = ck_tile::make_tuple(1e-3, 1e-3);
+        if(stride == n)
+        {
+            pass = ck_tile::check_err(
+                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+        }
+        else
+        {
+            for(int i_r = 0; i_r < m; i_r++)
+            {
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
+                                                      y_host_dev.begin() + i_r * stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
+                                                      y_host_ref.begin() + i_r * stride + n);
+                pass &= ck_tile::check_err(y_host_dev_row,
+                                           y_host_ref_row,
+                                           std::string("OUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+        }
+
+        std::cout << "[" << data_type << "]"
+                  << " m:" << m << ", n:" << n << ", stride:" << stride
+                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
new file mode 100644
index 000000000..f9cfe72de
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "rmsnorm2d_fwd.hpp"
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveInvRms_,
+          bool kTwoPass_>
+using trait_ = rmsnorm2d_fwd_traits_<DataType_,
+                                     Repeat_M_,
+                                     Repeat_N_,
+                                     ThreadPerBlock_M_,
+                                     ThreadPerBlock_N_,
+                                     Vector_N_,
+                                     kPadN_,
+                                     kSaveInvRms_,
+                                     kTwoPass_>;
+
+template <typename data_type>
+float rmsnorm2d_fwd_b16_(rmsnorm2d_fwd_traits /*t*/,
+                         rmsnorm2d_fwd_args a,
+                         const ck_tile::stream_config& s)
+{
+#if 1
+    float r = -1;
+    // clang-format off
+    //                                            rm  rn  tm   tn  vn  pd    rms     2p
+    if(a.n <= 64) {
+            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 128) {
+        if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 256) {
+        if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 512) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 8,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 768) {
+        if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3,  4,  64, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 6,  4,  64, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1,12,  4,  64, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 1024) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 2,  128, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 2,  128, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 1536) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 4,   64, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 2,  128, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 2048) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 1, 1,  256, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 8, 1,  256, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 3072) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 1,  128, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 3, 1, 1024, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n <= 4096) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, false>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, false>>(s, a);
+    }
+    else if(a.n > 4096) {
+        if (a.n % 8 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, true>>(s, a);
+        else if (a.n % 4 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, true>>(s, a);
+        else if (a.n % 2 == 0)
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, true>>(s, a);
+        else
+            r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, true>>(s, a);
+    }
+    return r;
+#else
+    return rmsnorm2d_fwd_<trait_<data_type,  1, 1,  1,  256, 4,  true,  false, false>>(s, a);
+#endif
+    // clang-format on
+}
+
+float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile::stream_config& s)
+{
+
+    float r = -1;
+    if(t.data_type.compare("fp16") == 0)
+    {
+        return rmsnorm2d_fwd_b16_<ck_tile::fp16_t>(t, a, s);
+    }
+    else if(t.data_type.compare("bf16") == 0)
+    {
+        return rmsnorm2d_fwd_b16_<ck_tile::bf16_t>(t, a, s);
+    }
+    if(r < 0)
+        throw std::runtime_error("Without supported instances!");
+
+    return r;
+}
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp
new file mode 100644
index 000000000..5e2a35f9e
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+#if 0
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
+
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
+#endif
+
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp
new file mode 100644
index 000000000..8c734806e
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp
new file mode 100644
index 000000000..922200143
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp
new file mode 100644
index 000000000..ed33c8492
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp
new file mode 100644
index 000000000..b753bbc34
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp
new file mode 100644
index 000000000..27cb9bdf3
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..23afb5672
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp
new file mode 100644
index 000000000..b428f5805
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp
new file mode 100644
index 000000000..300110669
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp
new file mode 100644
index 000000000..e9c8d6a1d
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp
new file mode 100644
index 000000000..15198eebe
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+#if 0
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
+
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
+#endif
+
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp
new file mode 100644
index 000000000..8ac85fa9b
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp
new file mode 100644
index 000000000..10e8fafc2
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp
new file mode 100644
index 000000000..4e1a80bf6
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp
new file mode 100644
index 000000000..45e56a92b
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp
new file mode 100644
index 000000000..35401f6f8
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..1e3700fad
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp
new file mode 100644
index 000000000..cdc4d00bd
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp
new file mode 100644
index 000000000..ec80c2ee4
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp
new file mode 100644
index 000000000..ddfc5a54e
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "rmsnorm2d_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                       rm  rn  tm  tn  vn  pd    rms     2p
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
+template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp
new file mode 100644
index 000000000..8f6ff84b6
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_instance_common.hpp
@@ -0,0 +1,65 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "rmsnorm2d_fwd.hpp"
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = rmsnorm2d_fwd_args;
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveInvRms_,
+          bool kTwoPass_>
+using trait_ = rmsnorm2d_fwd_traits_<DataType_,
+                                     Repeat_M_,
+                                     Repeat_N_,
+                                     ThreadPerBlock_M_,
+                                     ThreadPerBlock_N_,
+                                     Vector_N_,
+                                     kPadN_,
+                                     kSaveInvRms_,
+                                     kTwoPass_>;
+
+template <typename Traits_>
+float rmsnorm2d_fwd_(const S& s, A a)
+{
+    using DataType = typename Traits_::DataType;
+
+    using PipelineProblem =
+        ck_tile::Rmsnorm2dFwdPipelineProblem<typename RmsnormTypeConfig<DataType>::XDataType,
+                                             typename RmsnormTypeConfig<DataType>::GammaDataType,
+                                             typename RmsnormTypeConfig<DataType>::ComputeDataType,
+                                             typename RmsnormTypeConfig<DataType>::YDataType,
+                                             typename RmsnormTypeConfig<DataType>::InvRmsDataType,
+                                             typename Traits_::Shape,
+                                             Traits_::kPadN,
+                                             Traits_::kSaveInvRms,
+                                             Traits_::kTwoPass>;
+
+    using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Kernel = ck_tile::Rmsnorm2dFwd<Pipeline>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
new file mode 100644
index 000000000..698a8b43e
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -0,0 +1,179 @@
+#include "ck_tile/host.hpp"
+#include "rmsnorm2d_fwd.hpp"
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("save_rms", "0", "save rms(invrms) or not. set to 1 in training case")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType, bool SaveRms>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    float epsilon         = arg_parser.get_float("e");
+    std::string data_type = arg_parser.get_str("prec");
+    int kname             = arg_parser.get_int("kname");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using TypeConfig = RmsnormTypeConfig<DataType>;
+
+    using XDataType     = typename TypeConfig::XDataType;
+    using YDataType     = typename TypeConfig::YDataType;
+    using GammaDataType = typename TypeConfig::GammaDataType;
+
+    using InvRmsDataType =
+        std::conditional_t<SaveRms, typename TypeConfig::InvRmsDataType, ck_tile::null_type>;
+
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
+
+    ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+
+    std::cout << "[" << data_type << "]"
+              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+
+    rmsnorm2d_fwd_traits traits{data_type, SaveRms};
+
+    rmsnorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
+                            gamma_buf.GetDeviceBuffer(),
+                            y_buf.GetDeviceBuffer(),
+                            nullptr,
+                            epsilon,
+                            m,
+                            n,
+                            stride};
+
+    float ave_time = rmsnorm2d_fwd(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    std::size_t num_byte =
+        sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + sizeof(YDataType) * m * n;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                         GammaDataType,
+                                         ComputeDataType,
+                                         YDataType,
+                                         InvRmsDataType>(
+            x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon);
+
+        y_buf.FromDevice(y_host_dev.data());
+
+        auto [rtol, atol] = get_elimit<DataType>();
+        if(stride == n)
+        {
+            pass = ck_tile::check_err(
+                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+        }
+        else
+        {
+            for(int i_r = 0; i_r < m; i_r++)
+            {
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
+                                                      y_host_dev.begin() + i_r * stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
+                                                      y_host_ref.begin() + i_r * stride + n);
+                pass &= ck_tile::check_err(y_host_dev_row,
+                                           y_host_ref_row,
+                                           std::string("OUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    int save_rms                = arg_parser.get_int("save_rms");
+    if(data_type == "fp16" && save_rms)
+    {
+        return run<ck_tile::half_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "fp16" && !save_rms)
+    {
+        return run<ck_tile::half_t, false>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && save_rms)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && !save_rms)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
new file mode 100644
index 000000000..756ecb2c4
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/rmsnorm2d.hpp"
+#include <string>
+
+template <typename DataType>
+struct RmsnormTypeConfig;
+
+template <>
+struct RmsnormTypeConfig<ck_tile::half_t>
+{
+    using XDataType       = ck_tile::half_t;
+    using YDataType       = ck_tile::half_t;
+    using GammaDataType   = ck_tile::half_t;
+    using InvRmsDataType  = ck_tile::half_t;
+    using ComputeDataType = float;
+};
+
+template <>
+struct RmsnormTypeConfig<ck_tile::bf16_t>
+{
+    using XDataType       = ck_tile::bf16_t;
+    using YDataType       = ck_tile::bf16_t;
+    using GammaDataType   = ck_tile::bf16_t;
+    using InvRmsDataType  = ck_tile::bf16_t;
+    using ComputeDataType = float;
+};
+
+// runtime args
+struct rmsnorm2d_fwd_args : public ck_tile::Rmsnorm2dFwdHostArgs
+{
+};
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveInvRms_,
+          bool kTwoPass_>
+struct rmsnorm2d_fwd_traits_
+{
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Rmsnorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool kSaveInvRms = kSaveInvRms_;
+    static constexpr bool kTwoPass    = kTwoPass_;
+};
+
+template <typename Traits_>
+float rmsnorm2d_fwd_(const ck_tile::stream_config& s, rmsnorm2d_fwd_args a);
+
+// This is the public API, will be generated by script
+struct rmsnorm2d_fwd_traits
+{
+    std::string data_type;
+    bool save_rms;
+};
+
+float rmsnorm2d_fwd(rmsnorm2d_fwd_traits, rmsnorm2d_fwd_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
new file mode 100755
index 000000000..f3cfcc4b8
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
@@ -0,0 +1,38 @@
+
+# run from top of ck folder
+EXE=build/bin/tile_rmsnorm2d_fwd
+
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
new file mode 100755
index 000000000..6ec5e846c
--- /dev/null
+++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# call from top of CK folder
+EXE=./build/bin/tile_rmsnorm2d_fwd
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -prec=$pr_i -m=99  -n=13
+$EXE -prec=$pr_i -m=17  -n=16
+$EXE -prec=$pr_i -m=1   -n=100
+$EXE -prec=$pr_i -m=4   -n=128
+$EXE -prec=$pr_i -m=80  -n=127
+$EXE -prec=$pr_i -m=22  -n=255 -stride=256
+$EXE -prec=$pr_i -m=7   -n=599
+$EXE -prec=$pr_i -m=19  -n=512
+$EXE -prec=$pr_i -m=33  -n=313 -stride=1000
+$EXE -prec=$pr_i -m=11  -n=510
+$EXE -prec=$pr_i -m=171 -n=676 -stride=818
+$EXE -prec=$pr_i -m=91  -n=636
+$EXE -prec=$pr_i -m=12  -n=768 -stride=800
+$EXE -prec=$pr_i -m=100 -n=766 -stride=812
+$EXE -prec=$pr_i -m=31  -n=1024
+$EXE -prec=$pr_i -m=64  -n=1000 -stride=1004
+$EXE -prec=$pr_i -m=8   -n=1501
+$EXE -prec=$pr_i -m=3   -n=1826
+$EXE -prec=$pr_i -m=5   -n=2040
+$EXE -prec=$pr_i -m=7   -n=2734
+$EXE -prec=$pr_i -m=1   -n=3182
+$EXE -prec=$pr_i -m=9   -n=4096
+$EXE -prec=$pr_i -m=3   -n=8192
+$EXE -prec=$pr_i -m=1   -n=10547
+$EXE -prec=$pr_i -m=3   -n=17134
+done
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
new file mode 100644
index 000000000..6b0c3cef7
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp)
+target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS})
+
+set(TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
+
+set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd")
+add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp)
+target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
new file mode 100644
index 000000000..960369b78
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
@@ -0,0 +1,22 @@
+# Add + Rmsnorm2D + rowwise dynamic quantization forward
+
+This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization forward using ck_tile tile-programming implementation. Rdquant is short for rowwise dynamic quantization here.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_add_rmsnorm2d_rdquant_fwd -j
+```
+This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`
+
+## cmdline
+```
+args:
+          -m    m dimension (default:3328)
+          -n    m dimension (default:4096)
+          -e    epsilon (default:1e-5)
+          -v    cpu validation or not (default:1)
+       -prec    precision (default:fp16)
+```
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
new file mode 100644
index 000000000..43bc9a6cf
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
@@ -0,0 +1,279 @@
+#include "ck_tile/host.hpp"
+#include "add_rmsnorm2d_rdquant_fwd.hpp"
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("save_x", "1", "save rms(invrms) or not. set to 1 in training case")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType, bool SaveX>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    float epsilon         = arg_parser.get_float("e");
+    std::string data_type = arg_parser.get_str("prec");
+    int kname             = arg_parser.get_int("kname");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using TypeConfig = AddRmsnormRdquantTypeConfig<DataType>;
+
+    using ADataType       = typename TypeConfig::ADataType;
+    using BDataType       = typename TypeConfig::BDataType;
+    using GammaDataType   = typename TypeConfig::GammaDataType;
+    using XDataType       = typename TypeConfig::XDataType;
+    using YScaleDataType  = typename TypeConfig::YScaleDataType;
+    using QYDataType      = typename TypeConfig::QYDataType;
+    using ComputeDataType = float;
+
+    // host verify
+    ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<BDataType> b_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+
+    ck_tile::HostTensor<XDataType> x_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host_dev({m, n}, {stride, 1});
+
+    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
+
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+
+    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+    ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+
+    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf(x_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+    a_buf.ToDevice(a_host.data());
+    b_buf.ToDevice(b_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+
+    std::cout << "[" << data_type << "]"
+              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+
+    add_rmsnorm2d_rdquant_fwd_traits traits{data_type, SaveX};
+
+    add_rmsnorm2d_rdquant_fwd_args args{a_buf.GetDeviceBuffer(),
+                                        b_buf.GetDeviceBuffer(),
+                                        gamma_buf.GetDeviceBuffer(),
+                                        x_buf.GetDeviceBuffer(),
+                                        yscale_buf.GetDeviceBuffer(),
+                                        qy_buf.GetDeviceBuffer(),
+                                        epsilon,
+                                        m,
+                                        n,
+                                        stride};
+
+    float ave_time = add_rmsnorm2d_rdquant_fwd(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    std::size_t num_byte = sizeof(ADataType) * m * n + sizeof(BDataType) * m * n +
+                           sizeof(GammaDataType) * n + sizeof(YScaleDataType) * m +
+                           sizeof(QYDataType) * m * n;
+
+    if constexpr(SaveX)
+        num_byte += sizeof(XDataType) * m * n;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        using YDataType      = ComputeDataType;
+        using InvRmsDataType = DataType;
+
+        // Add
+        {
+            auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+            ck_tile::reference_binary_elementwise<ADataType, BDataType, XDataType, ComputeDataType>(
+                a_host, b_host, x_host_ref, op);
+
+            x_buf.FromDevice(x_host_dev.data());
+
+            auto [rtol, atol] = get_elimit<XDataType>();
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(
+                    x_host_dev, x_host_ref, std::string("x Error: Incorrect results!"), rtol, atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> x_host_dev_row(x_host_dev.begin() + i_r * stride,
+                                                           x_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> x_host_ref_row(x_host_ref.begin() + i_r * stride,
+                                                           x_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(x_host_dev_row,
+                                               x_host_ref_row,
+                                               std::string("x[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        ck_tile::HostTensor<YDataType> y_host({m, n});
+        // Rmsnorm2d
+        {
+            ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+
+            // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
+            // simplicity
+            ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                             GammaDataType,
+                                             ComputeDataType,
+                                             YDataType,
+                                             InvRmsDataType>(
+                x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
+        }
+
+        // yscale
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<YDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(qy_host_dev,
+                                          qy_host_ref,
+                                          std::string("qy Error: Incorrect results!"),
+                                          rtol,
+                                          atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    int save_x                  = arg_parser.get_int("save_x");
+    if(data_type == "fp16" && save_x)
+    {
+        return run<ck_tile::half_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "fp16" && !save_x)
+    {
+        return run<ck_tile::half_t, false>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && save_x)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16" && !save_x)
+    {
+        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
new file mode 100644
index 000000000..bf70d9d23
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant.hpp"
+#include <string>
+
+template <typename DataType>
+struct AddRmsnormRdquantTypeConfig;
+
+template <>
+struct AddRmsnormRdquantTypeConfig<ck_tile::half_t>
+{
+    using ADataType       = ck_tile::half_t;
+    using BDataType       = ck_tile::half_t;
+    using GammaDataType   = ck_tile::half_t;
+    using XDataType       = ck_tile::half_t;
+    using YScaleDataType  = ck_tile::half_t;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+template <>
+struct AddRmsnormRdquantTypeConfig<ck_tile::bf16_t>
+{
+    using ADataType       = ck_tile::bf16_t;
+    using BDataType       = ck_tile::bf16_t;
+    using GammaDataType   = ck_tile::bf16_t;
+    using XDataType       = ck_tile::bf16_t;
+    using YScaleDataType  = ck_tile::bf16_t;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+// runtime args
+struct add_rmsnorm2d_rdquant_fwd_args : public ck_tile::AddRmsnorm2dRdquantFwdHostArgs
+{
+};
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveX_,
+          bool kThreePass_>
+struct add_rmsnorm2d_rdquant_fwd_traits_
+{
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::AddRmsnorm2dRdquantShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN      = kPadN_;
+    static constexpr bool kSaveX     = kSaveX_;
+    static constexpr bool kThreePass = kThreePass_;
+};
+
+template <typename Traits_>
+float add_rmsnorm2d_rdquant_fwd_(const ck_tile::stream_config& s, add_rmsnorm2d_rdquant_fwd_args a);
+
+// This is the public API, will be generated by script
+struct add_rmsnorm2d_rdquant_fwd_traits
+{
+    std::string data_type;
+    bool save_x;
+};
+
+float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits,
+                                add_rmsnorm2d_rdquant_fwd_args,
+                                const ck_tile::stream_config&);
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
new file mode 100644
index 000000000..40fabf7f5
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -0,0 +1,280 @@
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant.hpp"
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    float epsilon         = arg_parser.get_float("e");
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using ADataType       = DataType;
+    using BDataType       = DataType;
+    using GammaDataType   = DataType;
+    using XDataType       = DataType;
+    using YScaleDataType  = DataType;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+
+    // host verify
+    ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<BDataType> b_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+
+    ck_tile::HostTensor<XDataType> x_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host_dev({m, n}, {stride, 1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+
+    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+    ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+
+    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf(x_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+    a_buf.ToDevice(a_host.data());
+    b_buf.ToDevice(b_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+
+    constexpr bool kThreePass = true;
+
+    using BlockWarps = ck_tile::sequence<2, 2>;
+    using BlockTile  = ck_tile::sequence<2, 128>;
+    using WarpTile   = ck_tile::sequence<1, 64>;
+    using Vector     = ck_tile::sequence<1, 1>;
+
+    using Shape   = ck_tile::AddRmsnorm2dRdquantShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Problem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem<ADataType,
+                                                                   BDataType,
+                                                                   GammaDataType,
+                                                                   ComputeDataType,
+                                                                   XDataType,
+                                                                   YScaleDataType,
+                                                                   QYDataType,
+                                                                   Shape,
+                                                                   true, // kPadN
+                                                                   true, // kSaveX
+                                                                   kThreePass>;
+
+    using OnePassPipeline   = ck_tile::AddRmsnorm2dRdquantFwdPipelineOnePass<Problem>;
+    using ThreePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineThreePass<Problem>;
+    using Pipeline          = std::conditional_t<kThreePass, ThreePassPipeline, OnePassPipeline>;
+    using Kernel            = ck_tile::AddRmsnorm2dRdquantFwd<Pipeline>;
+
+    ck_tile::AddRmsnorm2dRdquantFwdHostArgs args{a_buf.GetDeviceBuffer(),
+                                                 b_buf.GetDeviceBuffer(),
+                                                 gamma_buf.GetDeviceBuffer(),
+                                                 x_buf.GetDeviceBuffer(),
+                                                 yscale_buf.GetDeviceBuffer(),
+                                                 qy_buf.GetDeviceBuffer(),
+                                                 epsilon,
+                                                 m,
+                                                 n,
+                                                 stride};
+
+    auto kargs = Kernel::MakeKargs(args);
+
+    const dim3 grids                       = Kernel::GridSize(args);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat};
+
+    ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        using YDataType      = ComputeDataType;
+        using InvRmsDataType = DataType;
+
+        // Add
+        {
+            auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+            ck_tile::reference_binary_elementwise<ADataType, BDataType, XDataType, ComputeDataType>(
+                a_host, b_host, x_host_ref, op);
+
+            x_buf.FromDevice(x_host_dev.data());
+
+            auto [rtol, atol] = get_elimit<XDataType>();
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(
+                    x_host_dev, x_host_ref, std::string("x Error: Incorrect results!"), rtol, atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> x_host_dev_row(x_host_dev.begin() + i_r * stride,
+                                                           x_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> x_host_ref_row(x_host_ref.begin() + i_r * stride,
+                                                           x_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(x_host_dev_row,
+                                               x_host_ref_row,
+                                               std::string("x[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        ck_tile::HostTensor<YDataType> y_host({m, n});
+        // Rmsnorm2d
+        {
+            ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+
+            // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
+            // simplicity
+            ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                             GammaDataType,
+                                             ComputeDataType,
+                                             YDataType,
+                                             InvRmsDataType>(
+                x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
+        }
+
+        // yscale
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<YDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(qy_host_dev,
+                                          qy_host_ref,
+                                          std::string("qy Error: Incorrect results!"),
+                                          rtol,
+                                          atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << "[" << data_type << "]"
+                  << " m:" << m << ", n:" << n << ", stride:" << stride
+                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
new file mode 100644
index 000000000..57a0f254d
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "add_rmsnorm2d_rdquant_fwd.hpp"
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveX_,
+          bool kThreePass_>
+using trait_ = add_rmsnorm2d_rdquant_fwd_traits_<DataType_,
+                                                 Repeat_M_,
+                                                 Repeat_N_,
+                                                 ThreadPerBlock_M_,
+                                                 ThreadPerBlock_N_,
+                                                 Vector_N_,
+                                                 kPadN_,
+                                                 kSaveX_,
+                                                 kThreePass_>;
+
+template <typename data_type>
+float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits /*t*/,
+                                     add_rmsnorm2d_rdquant_fwd_args a,
+                                     const ck_tile::stream_config& s)
+{
+#if 1
+    float r = -1;
+    // clang-format off
+    //                                                      rm  rn  tm   tn  vn   pd     x      3p
+    if(a.n <= 64) {
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type, 1,  1,  4,  64, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 128) {
+        if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type, 1,  1,  4,  64, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type, 1,  2,  4,  64, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 256) {
+        if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1,  4,  64, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2,  4,  64, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4,  4,  64, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 512) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1,  4,  64, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2,  4,  64, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4,  4,  64, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 8,  4,  64, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 768) {
+        if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3,  4,  64, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 6,  4,  64, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1,12,  4,  64, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 1024) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 2,  128, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 2,  128, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1,  256, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 1536) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 4,   64, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 2,  128, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 1,  256, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 6, 1,  256, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 2048) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1, 1,  256, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 1,  256, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1,  256, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 8, 1,  256, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 3072) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 1,  128, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 1,  256, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 6, 1,  256, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 3, 1, 1024, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n <= 4096) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  true, false>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  true, false>>(s, a);
+    }
+    else if(a.n > 4096) {
+        if (a.n % 8 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  true, true>>(s, a);
+        else if (a.n % 4 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  true, true>>(s, a);
+        else if (a.n % 2 == 0)
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  true, true>>(s, a);
+        else
+            r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  true, true>>(s, a);
+    }
+    return r;
+#else
+    return add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  true, false>>(s, a);
+#endif
+    // clang-format on
+}
+
+float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t,
+                                add_rmsnorm2d_rdquant_fwd_args a,
+                                const ck_tile::stream_config& s)
+{
+
+    float r = -1;
+    // Only support instance of save_x == true for now
+    assert(t.save_x);
+    if(t.data_type.compare("fp16") == 0)
+    {
+        return add_rmsnorm2d_rdquant_fwd_b16_<ck_tile::fp16_t>(t, a, s);
+    }
+    else if(t.data_type.compare("bf16") == 0)
+    {
+        return add_rmsnorm2d_rdquant_fwd_b16_<ck_tile::bf16_t>(t, a, s);
+    }
+    if(r < 0)
+        throw std::runtime_error("Without supported instances!");
+
+    return r;
+}
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
new file mode 100644
index 000000000..5495e3c9a
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                                rm  rn  tm  tn  vn  pd      x     3p
+#if 0
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
+
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
+#endif
+
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp
new file mode 100644
index 000000000..8bbfdc858
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 4,   64, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 2,  128, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 1,  true,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp
new file mode 100644
index 000000000..381a11fc8
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 1, 1,  256, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 8, 1,  256, 1,  true,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp
new file mode 100644
index 000000000..2fefac693
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp
new file mode 100644
index 000000000..263713bbc
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1,  true,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp
new file mode 100644
index 000000000..c62c596fa
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..e4951f6ab
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp
new file mode 100644
index 000000000..4c7ee48e8
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 8,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp
new file mode 100644
index 000000000..8659dc82b
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn    pd     x      3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 1,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp
new file mode 100644
index 000000000..5f15f11b4
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  3,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  6,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 12,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
new file mode 100644
index 000000000..8ffdacbdc
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                                rm  rn  tm  tn  vn  pd      x     3p
+#if 0
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
+
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
+#endif
+
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp
new file mode 100644
index 000000000..355109965
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 4,   64, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 2,  128, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 1,  true,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp
new file mode 100644
index 000000000..d4d0474c2
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 1, 1,  256, 8,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 4,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 2,  true,  true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 8, 1,  256, 1,  true,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp
new file mode 100644
index 000000000..2cb300eda
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp
new file mode 100644
index 000000000..fb0ceb4c5
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,  true, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp
new file mode 100644
index 000000000..3a241a3c9
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true, true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..d3094679f
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true, true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true, true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true, true, true>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true, true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp
new file mode 100644
index 000000000..919bc177e
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 8,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp
new file mode 100644
index 000000000..8a44f5e00
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp
new file mode 100644
index 000000000..5c4f05ec3
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "add_rmsnorm2d_rdquant_fwd_instance_common.hpp"
+
+// clang-format off
+//                                                               rm  rn  tm  tn  vn     pd    x     3p
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , true, false>>(const S&, A);
+template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
new file mode 100644
index 000000000..6baaad471
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
@@ -0,0 +1,67 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "add_rmsnorm2d_rdquant_fwd.hpp"
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = add_rmsnorm2d_rdquant_fwd_args;
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveInvRms_,
+          bool kTwoPass_>
+using trait_ = add_rmsnorm2d_rdquant_fwd_traits_<DataType_,
+                                                 Repeat_M_,
+                                                 Repeat_N_,
+                                                 ThreadPerBlock_M_,
+                                                 ThreadPerBlock_N_,
+                                                 Vector_N_,
+                                                 kPadN_,
+                                                 kSaveInvRms_,
+                                                 kTwoPass_>;
+
+template <typename Traits_>
+float add_rmsnorm2d_rdquant_fwd_(const S& s, A a)
+{
+    using DataType = typename Traits_::DataType;
+
+    using PipelineProblem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem<
+        typename AddRmsnormRdquantTypeConfig<DataType>::ADataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::BDataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::GammaDataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::ComputeDataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::XDataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::YScaleDataType,
+        typename AddRmsnormRdquantTypeConfig<DataType>::QYDataType,
+        typename Traits_::Shape,
+        Traits_::kPadN,
+        Traits_::kSaveX,
+        Traits_::kThreePass>;
+
+    using OnePassPipeline   = ck_tile::AddRmsnorm2dRdquantFwdPipelineOnePass<PipelineProblem>;
+    using ThreePassPipeline = ck_tile::AddRmsnorm2dRdquantFwdPipelineThreePass<PipelineProblem>;
+    using Pipeline = std::conditional_t<Traits_::kThreePass, ThreePassPipeline, OnePassPipeline>;
+
+    using Kernel = ck_tile::AddRmsnorm2dRdquantFwd<Pipeline>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
new file mode 100755
index 000000000..11fd36488
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
@@ -0,0 +1,38 @@
+
+# run from top of ck folder
+EXE=build/bin/tile_add_rmsnorm2d_rdquant_fwd
+
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
new file mode 100755
index 000000000..4a02cdcb6
--- /dev/null
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# call from top of CK folder
+EXE=./build/bin/tile_add_rmsnorm2d_rdquant_fwd
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -prec=$pr_i -m=99  -n=13
+$EXE -prec=$pr_i -m=17  -n=16
+$EXE -prec=$pr_i -m=1   -n=100
+$EXE -prec=$pr_i -m=4   -n=128
+$EXE -prec=$pr_i -m=80  -n=127
+$EXE -prec=$pr_i -m=22  -n=255 -stride=256
+$EXE -prec=$pr_i -m=7   -n=599
+$EXE -prec=$pr_i -m=19  -n=512
+$EXE -prec=$pr_i -m=33  -n=313 -stride=1000
+$EXE -prec=$pr_i -m=11  -n=510
+$EXE -prec=$pr_i -m=171 -n=676 -stride=818
+$EXE -prec=$pr_i -m=91  -n=636
+$EXE -prec=$pr_i -m=12  -n=768 -stride=800
+$EXE -prec=$pr_i -m=100 -n=766 -stride=812
+$EXE -prec=$pr_i -m=31  -n=1024
+$EXE -prec=$pr_i -m=64  -n=1000 -stride=1004
+$EXE -prec=$pr_i -m=8   -n=1501
+$EXE -prec=$pr_i -m=3   -n=1826
+$EXE -prec=$pr_i -m=5   -n=2040
+$EXE -prec=$pr_i -m=7   -n=2734
+$EXE -prec=$pr_i -m=1   -n=3182
+$EXE -prec=$pr_i -m=9   -n=4096
+$EXE -prec=$pr_i -m=3   -n=8192
+$EXE -prec=$pr_i -m=1   -n=10547
+$EXE -prec=$pr_i -m=3   -n=17134
+done
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index c85e31341..e404e5019 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -9,4 +9,5 @@ add_subdirectory(04_img2col)
 add_subdirectory(05_reduce)
 add_subdirectory(06_permute)
 add_subdirectory(09_topk_softmax)
-
+add_subdirectory(10_rmsnorm2d)
+add_subdirectory(11_add_rmsnorm2d_rdquant)
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 14991d375..fa4b8d3cc 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -59,6 +59,7 @@
 #include "ck_tile/core/utility/magic_div.hpp"
 #include "ck_tile/core/utility/philox_rand.hpp"
 #include "ck_tile/core/utility/random.hpp"
+#include "ck_tile/core/utility/reduce_operator.hpp"
 #include "ck_tile/core/utility/to_sequence.hpp"
 #include "ck_tile/core/utility/transpose_vectors.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp
new file mode 100644
index 000000000..8b15d187f
--- /dev/null
+++ b/include/ck_tile/core/utility/reduce_operator.hpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+namespace ck_tile {
+
+namespace ReduceOp {
+// y = ReduceOp(y, x);
+struct Add
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    };
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return y + x;
+    }
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const
+    {
+        float y_ = type_convert<float>(y);
+        float x_ = type_convert<float>(x);
+
+        return type_convert<T>(y_ + x_);
+    }
+};
+
+struct SquareAdd
+{
+    template <typename T>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    };
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return y + (x * x);
+    }
+};
+
+struct Max
+{
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return numeric<T>::min();
+    };
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return max(y, x);
+    }
+};
+
+struct AbsMax
+{
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return numeric<T>::min();
+    };
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return max(y, abs(x));
+    }
+};
+
+} // namespace ReduceOp
+} // namespace ck_tile
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index a17ce751c..c0ab13ce3 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -19,11 +19,14 @@
 #include "ck_tile/host/reference/reference_batched_masking.hpp"
 #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
 #include "ck_tile/host/reference/reference_batched_softmax.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
 #include "ck_tile/host/reference/reference_permute.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
+#include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
 #include "ck_tile/host/reference/reference_topk.hpp"
 #include "ck_tile/host/stream_config.hpp"
diff --git a/include/ck_tile/host/reference/reference_elementwise.hpp b/include/ck_tile/host/reference/reference_elementwise.hpp
new file mode 100644
index 000000000..809049fa6
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_elementwise.hpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
+CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
+                                              HostTensor<BDataType>& b,
+                                              ElementOp element_op)
+{
+    // TODO: imeplement gpu version reference function
+    auto f = [&](auto i) {
+        auto v_a   = type_convert<ComputeDataType>(a.mData[i]);
+        auto v_b   = element_op(v_a);
+        b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
+    };
+
+    make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
+}
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ComputeDataType,
+          typename ElementOp>
+CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
+                                               const HostTensor<BDataType>& b,
+                                               HostTensor<CDataType>& c,
+                                               ElementOp element_op)
+{
+    // TODO: imeplement gpu version reference function
+    auto f = [&](auto i) {
+        auto v_a   = type_convert<ComputeDataType>(a.mData[i]);
+        auto v_b   = type_convert<ComputeDataType>(b.mData[i]);
+        auto v_c   = element_op(v_a, v_b);
+        c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
+    };
+
+    make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp
index b16cee3f9..8f8aa2367 100644
--- a/include/ck_tile/host/reference/reference_reduce.hpp
+++ b/include/ck_tile/host/reference/reference_reduce.hpp
@@ -9,24 +9,25 @@
 
 namespace ck_tile {
 
-template <typename ADataType, typename AccDataType, typename BDataType>
-CK_TILE_HOST void reference_reduce(const HostTensor<ADataType>& a_m_n, HostTensor<BDataType>& b_m)
+template <typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
+CK_TILE_HOST void
+reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m, ReduceOp reduce_op)
 {
     auto f = [&](auto m) {
-        const int N = a_m_n.mDesc.get_lengths()[1];
+        const int N = x_m_n.mDesc.get_lengths()[1];
 
-        AccDataType v_acc = 0;
+        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
 
         for(int n = 0; n < N; ++n)
         {
-            const ADataType v_a = a_m_n(m, n);
+            const ComputeDataType v_a = type_convert<ComputeDataType>(x_m_n(m, n));
 
-            v_acc += v_a;
+            v_acc = reduce_op(v_acc, v_a);
         }
 
-        b_m(m) = ck_tile::type_convert<BDataType>(v_acc);
+        y_m(m) = ck_tile::type_convert<YDataType>(v_acc);
     };
 
-    make_ParallelTensorFunctor(f, b_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
new file mode 100644
index 000000000..db6e92f4c
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType,
+          typename GammaDataType,
+          typename ComputeDataType,
+          typename YDataType,
+          typename InvRmsDataType>
+void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
+                             const HostTensor<GammaDataType>& gamma_n,
+                             HostTensor<YDataType>& y_m_n,
+                             HostTensor<InvRmsDataType>& invRms_m,
+                             ComputeDataType epsilon)
+{
+    auto rmsnorm2d_fwd_func = [&](auto m) {
+        const int N = x_m_n.mDesc.get_lengths()[1];
+
+        ComputeDataType mean_square = 0;
+        ComputeDataType divisor     = 0;
+
+        for(int n = 0; n < N; ++n)
+        {
+            ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
+            mean_square += x * x;
+        }
+
+        mean_square = mean_square / N;
+        divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(mean_square + epsilon);
+
+        if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
+            invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
+
+        for(int n = 0; n < N; ++n)
+        {
+            ComputeDataType x     = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
+            ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
+            auto y                = x * divisor * gamma;
+            y_m_n(m, n)           = ck_tile::type_convert<YDataType>(y);
+        }
+    };
+
+    make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
+        std::thread::hardware_concurrency());
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp b/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp
new file mode 100644
index 000000000..e9a398876
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_rowwise_quantization2d.hpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+template <typename XDataType, typename ScaleDataType, typename QXDataType>
+CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
+                                                   const HostTensor<ScaleDataType>& scale_m,
+                                                   HostTensor<QXDataType>& qx_m_n)
+{
+    auto f = [&](auto m) {
+        const int N = x_m_n.mDesc.get_lengths()[1];
+
+        for(int n = 0; n < N; ++n)
+        {
+            auto v_x = x_m_n(m, n);
+            // scale = amax / 127 for int8
+            auto v_scale = type_convert<XDataType>(scale_m(m));
+            auto v_qx    = v_x / v_scale;
+            qx_m_n(m, n) = saturates<QXDataType>{}(v_qx);
+        }
+    };
+
+    make_ParallelTensorFunctor(f,
+                               scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
new file mode 100644
index 000000000..eb06fea2d
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
+#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
new file mode 100644
index 000000000..4a0e29035
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// host side args
+struct AddRmsnorm2dRdquantFwdHostArgs
+{
+    const void* p_a;
+    const void* p_b;
+    const void* p_gamma;
+
+    void* p_x;
+    void* p_yscale;
+    void* p_qy;
+
+    float epsilon;
+
+    index_t m;
+    index_t n;
+    index_t stride; // row_stride
+};
+
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct AddRmsnorm2dRdquantFwd
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using ADataType       = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+    using GammaDataType   = remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using YScaleDataType  = remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr bool kSaveX = Problem::kSaveX;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kThreePass = Problem::kThreePass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    struct Kargs
+    {
+        const void* p_a;
+        const void* p_b;
+        const void* p_gamma;
+
+        void* p_x;
+        void* p_yscale;
+        void* p_qy;
+
+        float epsilon;
+
+        index_t m;
+        index_t n;
+        index_t stride; // row_stride
+    };
+    using Hargs = AddRmsnorm2dRdquantFwdHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        return Kargs{hargs.p_a,
+                     hargs.p_b,
+                     hargs.p_gamma,
+                     hargs.p_x,
+                     hargs.p_yscale,
+                     hargs.p_qy,
+                     hargs.epsilon,
+                     hargs.m,
+                     hargs.n,
+                     hargs.stride};
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        return integer_divide_ceil(hargs.m, Block_M);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kSaveX) n += "_x";
+            if (kThreePass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("add_rmsnorm2d_rdquant_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const auto iM = get_block_id() * Block_M;
+
+        const auto a_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const ADataType*>(kargs.p_a),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        const auto b_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const BDataType*>(kargs.p_b),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        const auto gamma_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const GammaDataType*>(kargs.p_gamma),
+                make_tuple(kargs.n),
+                make_tuple(1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
+        }();
+
+        auto x_window = [&]() {
+            if constexpr(kSaveX)
+            {
+                const auto tmp2_ = [&]() {
+                    const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                        static_cast<XDataType*>(kargs.p_x),
+                        make_tuple(kargs.m, kargs.n),
+                        make_tuple(kargs.stride, 1),
+                        number<Vector_N>{},
+                        number<1>{});
+
+                    return pad_tensor_view(tmp_,
+                                           make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                           sequence<kPadM, kPadN>{});
+                }();
+                return make_tile_window(
+                    tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+            }
+            else
+                return make_null_tile_window(make_tuple(number<Block_M>{}, number<Block_N>{}));
+        }();
+
+        auto yscale_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<YScaleDataType*>(kargs.p_yscale),
+                make_tuple(kargs.m),
+                make_tuple(1),
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+            return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {iM});
+        }();
+
+        auto qy_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<QYDataType*>(kargs.p_qy),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(a_window,
+                   b_window,
+                   gamma_window,
+                   x_window,
+                   yscale_window,
+                   qy_window,
+                   static_cast<const ComputeDataType>(kargs.epsilon),
+                   kargs.n,
+                   smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
new file mode 100644
index 000000000..a17c53c73
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+/*
+// clang-format off
+
+4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
+
+                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
+        +<----------------------< Repeat_N(2)>--------------------->+
+        |                                                           |
+        +<--    <WarpPerBlock_N(2)>  -->+
+            Warp_N
+        +--------------+--------------+--------------+--------------+----+----------------+
+ Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
+        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
+        | wrap_2       | wrap_3       |                             |    v
+        +--------------+--------------+--------------+--------------+----+           Block_M
+        |                             |                             |
+        +                             +                             |
+        |                             |                             |                     v
+        +--------------+--------------+--------------+--------------+                     +
+
+        each Warp-tile (e.g 16 thrd per row)
+
+         Vector_N (contiguous pixels each thrd holds along N, or vector size)
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
+        +-----------+-----------+-----------+-----------+-----------+
+// clang-format on
+*/
+template <typename BlockTile_,    // block size, seq<M, N>
+          typename WarpPerBlock_, // num warps along seq<M, N>
+          typename WarpTile_,     // warp size, seq<M, N>
+          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
+          index_t BlockSize_ =
+              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
+struct AddRmsnorm2dRdquantShape
+{
+    // block size
+    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
+
+    // num warps along seq<M, N>, within each block
+    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
+
+    // warp size
+    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
+
+    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
+    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
+    // repeat of each thread along seq<M, N>
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    // vector size along seq<M, N>
+    static constexpr index_t Vector_M = Vector_::at(number<0>{});
+    static constexpr index_t Vector_N = Vector_::at(number<1>{});
+
+    static_assert(Warp_M % Vector_M == 0);
+    static_assert(Warp_N % Vector_N == 0);
+    // num of threads along seq<M, N>, within each warp
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+
+    static constexpr index_t BlockSize = BlockSize_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
new file mode 100644
index 000000000..73ba633b1
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+
+namespace ck_tile {
+
+struct AddRmsnorm2dRdquantFwdPipelineDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeABXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<0, 1>, sequence<1, 2>>,
+                sequence<1, 1>,
+                sequence<0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::BlockShape>;
+
+            using block_reduce2d = BlockReduce2d<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::ComputeDataType>(
+                    MakeABXBlockTileDistribution<Problem>()));
+            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
+
+            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
new file mode 100644
index 000000000..12a15938a
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = AddRmsnorm2dRdquantFwdPipelineDefaultPolicy>
+struct AddRmsnorm2dRdquantFwdPipelineOnePass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using ADataType       = ck_tile::remove_cvref_t<typename Problem::ADataType>;
+    using BDataType       = ck_tile::remove_cvref_t<typename Problem::BDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using YScaleDataType  = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr bool kHasGamma = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveX    = Problem::kSaveX;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
+    static constexpr bool kPadN = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_op"; // block per row
+        else
+            return "wpr_op"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename AWindow,
+              typename BWindow,
+              typename GammaWindow,
+              typename XWindow,
+              typename YScaleWindow,
+              typename QYWindow>
+    CK_TILE_DEVICE auto operator()(const AWindow& a_window_,
+                                   const BWindow& b_window_,
+                                   const GammaWindow& gamma_window_,
+                                   XWindow& x_window,
+                                   YScaleWindow& yscale_window,
+                                   QYWindow& qy_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        const auto a_window =
+            make_tile_window(a_window_, Policy::template MakeABXBlockTileDistribution<Problem>());
+        const auto b_window =
+            make_tile_window(b_window_, Policy::template MakeABXBlockTileDistribution<Problem>());
+        const auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBlockTileDistribution<Problem>());
+
+        auto reduce_square_sum_func = ReduceOp::SquareAdd{};
+        auto reduce_sum_func        = ReduceOp::Add{};
+        auto reduce_absmax_func     = ReduceOp::AbsMax{};
+        auto reduce_max_func        = ReduceOp::Max{};
+        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        const auto a     = load_tile(a_window);
+        const auto b     = load_tile(b_window);
+        const auto gamma = load_tile(gamma_window);
+
+        auto x = tile_elementwise_in(
+            [&](const auto& a_, const auto& b_) {
+                return type_convert<ComputeDataType>(a_) + type_convert<ComputeDataType>(b_);
+            },
+            a,
+            b);
+
+        if constexpr(kSaveX)
+            store_tile(x_window, cast_tile<XDataType>(x));
+
+        // compute mean square, each-thread->cross-lane->cross-warp
+        auto square_sum = block_reduce2d(
+            x, reduce_square_sum_func.GetIdentityValue<ComputeDataType>(), reduce_square_sum_func);
+        block_reduce2d_sync(square_sum, reduce_sum_func);
+        block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func);
+
+        auto inv_rms = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ / row_size + epsilon));
+            },
+            square_sum);
+
+        // rmsnorm computation
+        auto y = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+        sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) {
+            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+            constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+            const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+
+            const auto x_ = type_convert<ComputeDataType>(x[idx]);
+            auto y_       = x_ * inv_rms_[i_idx] * gamma_;
+
+            y(idx) = type_convert<ComputeDataType>(y_);
+        });
+
+        // compute absmax, each-thread->cross-lane->cross-warp
+        auto absmax = block_reduce2d(
+            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        block_reduce2d_sync(absmax, reduce_max_func);
+        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
+
+        // ex: yscale = absmax / 127 if int8
+        auto yscale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
+            },
+            absmax);
+        store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
+
+        // quantize y to qy
+        auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
+        sweep_tile(qy, [&, yscale_ = yscale](auto idx) {
+            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+            auto qy_             = y[idx] / yscale_[i_idx];
+            qy(idx)              = saturates<QYDataType>{}(qy_);
+        });
+        store_tile(qy_window, qy);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
new file mode 100644
index 000000000..106e5086b
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+// X = A + B, Y = Rmsnorm2d(X), QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
+template <typename ADataType_,
+          typename BDataType_,
+          typename GammaDataType_,
+          typename ComputeDataType_,
+          typename XDataType_,
+          typename YScaleDataType_,
+          typename QYDataType_,
+          typename BlockShape_,
+          bool kPadN_,
+          bool kSaveX_,
+          bool kThreePass_>
+struct AddRmsnorm2dRdquantFwdPipelineProblem
+{
+    using ADataType       = remove_cvref_t<ADataType_>;
+    using BDataType       = remove_cvref_t<BDataType_>;
+    using GammaDataType   = remove_cvref_t<GammaDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using YScaleDataType  = remove_cvref_t<YScaleDataType_>;
+    using QYDataType      = remove_cvref_t<QYDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+
+    static constexpr bool kPadN      = kPadN_;
+    static constexpr bool kSaveX     = kSaveX_;
+    static constexpr bool kThreePass = kThreePass_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
new file mode 100644
index 000000000..0dbb20645
--- /dev/null
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = AddRmsnorm2dRdquantFwdPipelineDefaultPolicy>
+struct AddRmsnorm2dRdquantFwdPipelineThreePass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using ADataType       = ck_tile::remove_cvref_t<typename Problem::ADataType>;
+    using BDataType       = ck_tile::remove_cvref_t<typename Problem::BDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using YScaleDataType  = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr bool kHasGamma = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveX    = Problem::kSaveX;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
+    static constexpr bool kPadN = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_tp"; // block per row
+        else
+            return "wpr_tp"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename AWindow,
+              typename BWindow,
+              typename GammaWindow,
+              typename XWindow,
+              typename YScaleWindow,
+              typename QYWindow>
+    CK_TILE_DEVICE auto operator()(const AWindow& a_window_,
+                                   const BWindow& b_window_,
+                                   const GammaWindow& gamma_window_,
+                                   XWindow& x_window_,
+                                   YScaleWindow& yscale_window,
+                                   QYWindow& qy_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        auto a_window =
+            make_tile_window(a_window_, Policy::template MakeABXBlockTileDistribution<Problem>());
+        auto b_window =
+            make_tile_window(b_window_, Policy::template MakeABXBlockTileDistribution<Problem>());
+        auto x_window = [&]() {
+            if constexpr(kSaveX)
+                return make_tile_window(x_window_,
+                                        Policy::template MakeABXBlockTileDistribution<Problem>());
+            else
+                return x_window_;
+        }();
+        auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBlockTileDistribution<Problem>());
+
+        auto reduce_square_sum_func = ReduceOp::SquareAdd{};
+        auto reduce_sum_func        = ReduceOp::Add{};
+        auto reduce_absmax_func     = ReduceOp::AbsMax{};
+        auto reduce_max_func        = ReduceOp::Max{};
+        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+
+        using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(a_window)));
+        auto square_sum   = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto a = load_tile(a_window);
+            const auto b = load_tile(b_window);
+
+            auto x = tile_elementwise_in(
+                [&](const auto& a_, const auto& b_) {
+                    return type_convert<ComputeDataType>(a_) + type_convert<ComputeDataType>(b_);
+                },
+                a,
+                b);
+
+            if constexpr(kSaveX)
+                store_tile(x_window, cast_tile<XDataType>(x));
+
+            block_reduce2d(x, square_sum, reduce_square_sum_func);
+            move_tile_window(x_window, {0, Block_N});
+            move_tile_window(a_window, {0, Block_N});
+            move_tile_window(b_window, {0, Block_N});
+        }
+
+        block_reduce2d_sync(square_sum, reduce_sum_func);
+        block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func);
+
+        auto inv_rms = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ / row_size + epsilon));
+            },
+            square_sum);
+
+        // reverse read x to reuse cache
+        ck_tile::index_t stride_to_right_most_window =
+            row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
+
+        if constexpr(kSaveX)
+            move_tile_window(x_window, {0, -Block_N});
+        else
+        {
+            move_tile_window(a_window, {0, -Block_N});
+            move_tile_window(b_window, {0, -Block_N});
+        }
+        move_tile_window(gamma_window, {stride_to_right_most_window});
+
+        using YTensorType = XTensorType;
+        auto absmax       = block_reduce2d.template MakeYBlockTile<YTensorType>();
+        set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
+
+        // rmsnorm computation + absmax(threadwise reduce)
+        if constexpr(kSaveX)
+            __syncthreads();
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            auto x = [&]() {
+                if constexpr(kSaveX)
+                {
+                    return load_tile(x_window);
+                }
+                else
+                {
+                    const auto a = load_tile(a_window);
+                    const auto b = load_tile(b_window);
+                    return tile_elementwise_in(
+                        [&](const auto& a_, const auto& b_) {
+                            return type_convert<ComputeDataType>(a_) +
+                                   type_convert<ComputeDataType>(b_);
+                        },
+                        a,
+                        b);
+                }
+            }();
+
+            auto gamma = load_tile(gamma_window);
+            auto y     = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+
+            sweep_tile(y, [&](auto idx) {
+                constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+                const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+
+                const auto x_ = type_convert<ComputeDataType>(x[idx]);
+                auto y_       = x_ * inv_rms[i_idx] * gamma_;
+
+                y(idx) = type_convert<ComputeDataType>(y_);
+            });
+
+            block_reduce2d(y, absmax, reduce_absmax_func);
+
+            if constexpr(kSaveX)
+                move_tile_window(x_window, {0, -Block_N});
+            else
+            {
+                move_tile_window(a_window, {0, -Block_N});
+                move_tile_window(b_window, {0, -Block_N});
+            }
+            move_tile_window(gamma_window, {-Block_N});
+        }
+
+        // compute absmax, cross-lane->cross-warp
+        block_reduce2d_sync(absmax, reduce_max_func);
+        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
+
+        // ex: yscale = absmax / 127 if int8
+        auto yscale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
+            },
+            absmax);
+        store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
+
+        // quantize y to qy
+        // recompute rmsnorm, try to save y in the future
+        if constexpr(kSaveX)
+            move_tile_window(x_window, {0, Block_N});
+        else
+        {
+            move_tile_window(a_window, {0, Block_N});
+            move_tile_window(b_window, {0, Block_N});
+        }
+        move_tile_window(gamma_window, {Block_N});
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            auto x = [&]() {
+                if constexpr(kSaveX)
+                {
+                    return load_tile(x_window);
+                }
+                else
+                {
+                    const auto a = load_tile(a_window);
+                    const auto b = load_tile(b_window);
+                    return tile_elementwise_in(
+                        [&](const auto& a_, const auto& b_) {
+                            return type_convert<ComputeDataType>(a_) +
+                                   type_convert<ComputeDataType>(b_);
+                        },
+                        a,
+                        b);
+                }
+            }();
+
+            auto gamma = load_tile(gamma_window);
+            auto y     = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+            auto qy    = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
+
+            sweep_tile(y, [&](auto idx) {
+                constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+                const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+
+                const auto x_ = type_convert<ComputeDataType>(x[idx]);
+                auto y_       = x_ * inv_rms[i_idx] * gamma_;
+                auto qy_      = y_ / yscale[i_idx];
+                qy(idx)       = saturates<QYDataType>{}(qy_);
+            });
+
+            store_tile(qy_window, qy);
+
+            if constexpr(kSaveX)
+                move_tile_window(x_window, {0, Block_N});
+            else
+            {
+                move_tile_window(a_window, {0, Block_N});
+                move_tile_window(b_window, {0, Block_N});
+            }
+            move_tile_window(gamma_window, {Block_N});
+            move_tile_window(qy_window, {0, Block_N});
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index bf002141b..c767a472a 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -35,9 +35,9 @@ struct Layernorm2dFwdPipelineOnePass
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr"; // block per row
+            return "bpr_op"; // block per row
         else
-            return "wpr"; // warp per row
+            return "wpr_op"; // warp per row
     }();
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index db094ac2a..e35d02e70 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -35,9 +35,9 @@ struct Layernorm2dFwdPipelineTwoPass
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr"; // block per row
+            return "bpr_tp"; // block per row
         else
-            return "wpr"; // warp per row
+            return "wpr_tp"; // warp per row
     }();
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
@@ -118,8 +118,6 @@ struct Layernorm2dFwdPipelineTwoPass
         ck_tile::index_t stride_to_right_most_window =
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
 
-        // x_window.foo();
-        // gamma_window.foo();
         move_tile_window(x_window, {0, -Block_N});
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(beta_window, {stride_to_right_most_window});
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index a5ba745d2..fe2d24044 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -4,4 +4,7 @@
 #pragma once
 
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 51d55235e..d9df949cf 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core.hpp"
 #include <tuple>
 
+// This file is not support cross warp reduce
 namespace ck_tile {
 
 /*
@@ -15,8 +16,8 @@ namespace ck_tile {
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
 template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
-                                           const ReduceFunc& reduce_func,
-                                           bool_constant<WithBroadcast> = {})
+                                                          const ReduceFunc& reduce_func,
+                                                          bool_constant<WithBroadcast> = {})
 {
     using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode       = typename Dstr::DstrEncode;
@@ -115,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
  */
 template <typename AccDistributedTensor_, typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
-                                               const ReduceFunc& reduce_func)
+                                                              const ReduceFunc& reduce_func)
 {
     using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode       = typename Dstr::DstrEncode;
@@ -174,9 +175,9 @@ template <typename AccDistributedTensor_,
           index_t... InReduceDims,
           typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
-                                      const InDistributedTensor_& in_tensor,
-                                      sequence<InReduceDims...>,
-                                      const ReduceFunc& reduce_func)
+                                                     const InDistributedTensor_& in_tensor,
+                                                     sequence<InReduceDims...>,
+                                                     const ReduceFunc& reduce_func)
 {
     constexpr auto I0 = number<0>{};
     constexpr auto I1 = number<1>{};
@@ -249,9 +250,9 @@ template <typename AccDataType_,
           typename ReduceFunc,
           typename InDataType_>
 CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
-                                      sequence<InReduceDims...> in_reduce_dims,
-                                      const ReduceFunc& reduce_func,
-                                      const InDataType_& reduce_init)
+                                                     sequence<InReduceDims...> in_reduce_dims,
+                                                     const ReduceFunc& reduce_func,
+                                                     const InDataType_& reduce_init)
 {
     using InDataType  = typename InDistributedTensor_::DataType;
     using AccDataType = remove_cvref_t<AccDataType_>;
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
new file mode 100644
index 000000000..beb8c718e
--- /dev/null
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockReduce2d
+{
+    // in-thread reduction
+    using Problem         = remove_cvref_t<Problem_>;
+    using XDataType       = typename Problem::XDataType;
+    using ComputeDataType = typename Problem::ComputeDataType;
+
+    CK_TILE_DEVICE constexpr BlockReduce2d() {}
+
+    template <typename XDistributedTensor_, typename YDistributedTensor_, typename ReduceFunc>
+    CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
+                                   YDistributedTensor_& y_tensor,
+                                   const ReduceFunc& reduce_func)
+    {
+        constexpr auto I0 = number<0>{};
+        constexpr auto I1 = number<1>{};
+
+        constexpr auto spans = XDistributedTensor_::get_distributed_spans();
+
+        // FIXME: hard coded to reduce 2nd axis
+        sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
+            constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0);
+
+            auto y = y_tensor[y_dstr_idx];
+
+            sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
+                constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
+                const auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
+
+                y = reduce_func(y, x);
+            });
+
+            y_tensor(y_dstr_idx) = y;
+        });
+    }
+
+    template <typename XDistributedTensor_>
+    CK_TILE_DEVICE static auto MakeYBlockTile()
+    {
+        static_assert(std::is_same_v<XDataType, typename XDistributedTensor_::DataType>, "wrong!");
+
+        // FIXME: hard coded to reduce 2nd axis
+        constexpr auto reduce_dims = sequence<1>{};
+
+        constexpr auto dstr =
+            make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding(
+                XDistributedTensor_::get_tile_distribution()
+                    .get_static_tile_distribution_encoding(),
+                reduce_dims));
+
+        auto tensor = make_static_distributed_tensor<ComputeDataType>(dstr);
+
+        return tensor;
+    }
+
+    template <typename XDistributedTensor_, typename ReduceFunc>
+    CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor,
+                                   const ComputeDataType& reduce_init,
+                                   const ReduceFunc& reduce_func)
+    {
+        auto y_tensor = MakeYBlockTile<XDistributedTensor_>();
+        set_tile(y_tensor, reduce_init);
+        (*this)(x_tensor, y_tensor, reduce_func);
+
+        return y_tensor;
+    }
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockReduce2dSync
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    template <typename YDistributedTensor_, typename ReduceFunc>
+    CK_TILE_DEVICE void operator()(YDistributedTensor_& y_tensor, const ReduceFunc& reduce_func)
+    {
+        using Dstr             = typename YDistributedTensor_::StaticTileDistribution;
+        using DstrEncode       = typename Dstr::DstrEncode;
+        using DstrEncodeDetail = typename DstrEncode::detail;
+
+        constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
+        constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
+
+        constexpr index_t idim_p_lane = NDimP - 1;
+
+        // const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
+        // const auto rs_idx =
+        //     y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
+
+        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();
+
+        // loop over thread data
+        static_for<0, thread_buf_size, 1>{}([&](auto i) {
+            auto v_local = y_tensor.get_thread_buffer()[i];
+
+            // cross-lane reduce for replication
+            // only reduce on R dimension correspond to lane
+            // (lane id maps to this R dimension)
+            static_for<0, NDimR, 1>{}([&](auto idim_r) {
+                // FIXME: nasty to use does_p_own_r_
+                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
+                {
+                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
+
+                    constexpr index_t lid_over_rid_derivative =
+                        DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];
+
+                    static_assert(is_power_of_two_integer(r_length),
+                                  "wrong! only support power of 2 reduction");
+
+                    constexpr index_t nstage = integer_log2_floor(r_length);
+
+                    // reduction sweep forward
+                    static_for<0, nstage, 1>{}([&](auto istage) {
+                        // xor
+                        index_t src_lane =
+                            (__lane_id()) ^
+                            (number<lid_over_rid_derivative << istage.value>{}.value);
+
+                        // pull data from remote lane
+                        const auto v_remote = warp_shuffle(v_local, src_lane);
+
+                        // reduce
+                        v_local = reduce_func(v_local, v_remote);
+                    });
+                }
+            });
+
+            // TODO - Do we need to broadcast to other lane?
+            y_tensor.get_thread_buffer()(i) = v_local;
+        });
+    }
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct BlockReduce2dCrossWarpSync
+{
+    using Problem    = remove_cvref_t<Problem_>;
+    using BlockShape = typename Problem::BlockShape;
+
+    template <typename YDistributedTensor_>
+    CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
+    {
+        constexpr index_t num_reduce_warps = [&]() {
+            using Dstr             = typename YDistributedTensor_::StaticTileDistribution;
+            using DstrEncode       = typename Dstr::DstrEncode;
+            using DstrEncodeDetail = typename DstrEncode::detail;
+
+            constexpr index_t NDimR = Dstr::get_num_of_dimension_r();
+
+            constexpr index_t idim_p_warp = 0;
+
+            index_t len_ = 1;
+            static_for<0, NDimR, 1>{}([&](auto idim_r) {
+                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r])
+                {
+                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
+                    len_ *= r_length;
+                }
+            });
+            return len_;
+        }();
+        return num_reduce_warps;
+    }
+
+    // return in byte
+    template <typename YDistributedTensor_>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        using DataType = typename YDistributedTensor_::DataType;
+        // constexpr auto num_reduce_warps = GetReduceWarps<YDistributedTensor_>();
+
+        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();
+
+        // we need to store all data from every wave into smem
+        // e.g. 2x2 reduce along N
+        //     -------------> reduce N
+        //    | w0 | w1 |   ___>      | w01 |
+        //    | w2 | w3 |             | w23 |
+        //
+        //   -> store data from every wave into LDS
+        //
+        //
+        //     -------------> reduce N
+        //    | w0 | w1 | w2 | w3 |   ----->  | w0123 |
+        //
+        //   -> also store data from every wave into LDS
+        constexpr index_t num_warps = BlockShape::BlockSize / warpSize;
+        return num_warps * thread_buf_size * sizeof(DataType);
+    }
+
+    template <typename YDistributedTensor_, typename ReduceFunc>
+    CK_TILE_DEVICE void
+    operator()(YDistributedTensor_& y_tensor, void* smem, const ReduceFunc& reduce_func)
+    {
+        using DataType = typename YDistributedTensor_::DataType;
+
+        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();
+
+        DataType* smem_ptr              = reinterpret_cast<DataType*>(smem);
+        const index_t lane_id           = get_lane_id();
+        const index_t warp_id           = get_warp_id();
+        constexpr auto num_reduce_warps = GetReduceWarps<YDistributedTensor_>();
+        constexpr index_t num_warps     = BlockShape::BlockSize / warpSize;
+        const index_t smem_offset       = warp_id;
+
+        // skip if nonthing to do
+        if constexpr(num_reduce_warps == 1)
+            return;
+
+        // store into smem only for lane-0 within one warp
+        if(lane_id == 0)
+        {
+            static_for<0, thread_buf_size, 1>{}([&](auto i) {
+                smem_ptr[smem_offset + i * num_warps] = y_tensor.get_thread_buffer()[i];
+            });
+        }
+        block_sync_lds();
+
+        // load from smem. here we let everythread to do compute :)
+        index_t local_warp_id = warp_id / num_reduce_warps;
+        index_t local_smem_os = local_warp_id * num_reduce_warps;
+        DataType all_scratch[thread_buf_size * num_reduce_warps];
+        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
+            static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
+                all_scratch[i_0 * num_reduce_warps + i_1] =
+                    smem_ptr[i_0 * num_warps + local_smem_os + i_1];
+            });
+        });
+        block_sync_lds(); // TODO: we don't need sync here
+
+        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
+            // TODO: use descriptor for this
+            auto v_local = all_scratch[i_0 * num_reduce_warps];
+
+            // further reduce mean/var
+            static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
+                constexpr auto i_1      = number<i_1_n1 + 1>{};
+                const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
+
+                // reduce
+                v_local = reduce_func(v_local, v_remote);
+            });
+
+            y_tensor.get_thread_buffer()(i_0) = v_local;
+        });
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp
new file mode 100644
index 000000000..3c547242d
--- /dev/null
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+
+namespace ck_tile {
+
+struct BlockReduce2dDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::BlockShape>;
+
+            using block_reduce2d = BlockReduce2d<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
+                    MakeXBlockTileDistribution<Problem>()));
+            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
+
+            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp
new file mode 100644
index 000000000..b75f4f076
--- /dev/null
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
+struct BlockReduce2dProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp
new file mode 100644
index 000000000..98c60f1b5
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d.hpp
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp"
+#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
new file mode 100644
index 000000000..99084a25e
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// host side args
+struct Rmsnorm2dFwdHostArgs
+{
+    const void* p_x;
+    const void* p_gamma;
+
+    void* p_y;
+    void* p_invRms;
+
+    float epsilon;
+
+    index_t m;
+    index_t n;
+    index_t stride; // row_stride
+};
+
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct Rmsnorm2dFwd
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = remove_cvref_t<typename Problem::YDataType>;
+    using InvRmsDataType  = remove_cvref_t<typename Problem::InvRmsDataType>;
+
+    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, null_type>;
+    static constexpr bool kSaveInvRms = Problem::kSaveInvRms;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kTwoPass   = Problem::kTwoPass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    struct Kargs
+    {
+        const void* p_x;
+        const void* p_gamma;
+
+        void* p_y;
+        void* p_invRms;
+
+        float epsilon;
+
+        index_t m;
+        index_t n;
+        index_t stride; // row_stride
+    };
+    using Hargs = Rmsnorm2dFwdHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        return Kargs{hargs.p_x,
+                     hargs.p_gamma,
+                     hargs.p_y,
+                     hargs.p_invRms,
+                     hargs.epsilon,
+                     hargs.m,
+                     hargs.n,
+                     hargs.stride};
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        return (hargs.m + Block_M - 1) / Block_M;
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kSaveInvRms) n += "_rms";
+            if (kTwoPass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("rmsnorm2d_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const auto iM = get_block_id() * Block_M;
+
+        const auto x_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XDataType*>(kargs.p_x),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        const auto gamma_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const GammaDataType*>(kargs.p_gamma),
+                make_tuple(kargs.n),
+                make_tuple(1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
+        }();
+
+        auto y_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<YDataType*>(kargs.p_y),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        auto inv_rms_window = [&]() {
+            if constexpr(kSaveInvRms)
+            {
+                const auto inv_rms_m = [&]() {
+                    const auto inv_rms_dram_naive =
+                        make_naive_tensor_view_packed<address_space_enum::global>(
+                            static_cast<InvRmsDataType*>(kargs.p_invRms),
+                            make_tuple(kargs.m),
+                            number<1>{});
+
+                    return pad_tensor_view(
+                        inv_rms_dram_naive, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+                }();
+                return make_tile_window(inv_rms_m, make_tuple(number<Block_M>{}), {iM});
+            }
+            else
+                return make_null_tile_window(make_tuple(number<Block_M>{}));
+        }();
+
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(x_window,
+                   gamma_window,
+                   y_window,
+                   inv_rms_window,
+                   static_cast<const ComputeDataType>(kargs.epsilon),
+                   kargs.n,
+                   smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
new file mode 100644
index 000000000..fb484a106
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+/*
+// clang-format off
+
+4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
+
+                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
+        +<----------------------< Repeat_N(2)>--------------------->+
+        |                                                           |
+        +<--    <WarpPerBlock_N(2)>  -->+
+            Warp_N
+        +--------------+--------------+--------------+--------------+----+----------------+
+ Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
+        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
+        | wrap_2       | wrap_3       |                             |    v
+        +--------------+--------------+--------------+--------------+----+           Block_M
+        |                             |                             |
+        +                             +                             |
+        |                             |                             |                     v
+        +--------------+--------------+--------------+--------------+                     +
+
+        each Warp-tile (e.g 16 thrd per row)
+
+         Vector_N (contiguous pixels each thrd holds along N, or vector size)
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
+        +-----------+-----------+-----------+-----------+-----------+
+        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
+        +-----------+-----------+-----------+-----------+-----------+
+// clang-format on
+*/
+template <typename BlockTile_,    // block size, seq<M, N>
+          typename WarpPerBlock_, // num warps along seq<M, N>
+          typename WarpTile_,     // warp size, seq<M, N>
+          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
+          index_t BlockSize_ =
+              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
+struct Rmsnorm2dShape
+{
+    // block size
+    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
+
+    // num warps along seq<M, N>, within each block
+    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
+
+    // warp size
+    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
+
+    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
+    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
+    // repeat of each thread along seq<M, N>
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    // vector size along seq<M, N>
+    static constexpr index_t Vector_M = Vector_::at(number<0>{});
+    static constexpr index_t Vector_N = Vector_::at(number<1>{});
+
+    static_assert(Warp_M % Vector_M == 0);
+    static_assert(Warp_N % Vector_N == 0);
+    // num of threads along seq<M, N>, within each warp
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+
+    static constexpr index_t BlockSize = BlockSize_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
new file mode 100644
index 000000000..e4814cf45
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+
+namespace ck_tile {
+
+struct Rmsnorm2dFwdPipelineDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<0, 1>, sequence<1, 2>>,
+                sequence<1, 1>,
+                sequence<0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::BlockShape>;
+
+            using block_reduce2d = BlockReduce2d<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
+                    MakeXBlockTileDistribution<Problem>()));
+            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
+
+            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
new file mode 100644
index 000000000..68cfe4282
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = Rmsnorm2dFwdPipelineDefaultPolicy>
+struct Rmsnorm2dFwdPipelineOnePass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+    using InvRmsDataType  = ck_tile::remove_cvref_t<typename Problem::InvRmsDataType>;
+
+    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveInvRms = Problem::kSaveInvRms;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_op"; // block per row
+        else
+            return "wpr_op"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow, typename GammaWindow, typename YWindow, typename InvRmsWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const GammaWindow& gamma_window_,
+                                   YWindow& y_window,
+                                   InvRmsWindow& inv_rms_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        const auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        const auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBlockTileDistribution<Problem>());
+
+        auto reduce_square_sum_func = ReduceOp::SquareAdd{};
+        auto reduce_sum_func        = ReduceOp::Add{};
+        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        const auto x = load_tile(x_window);
+        // load gamma (TODO: support no gamma?)
+        const auto gamma = load_tile(gamma_window);
+
+        // compute mean square each-thread->cross-lane->cross-warp
+        auto square_sum = block_reduce2d(
+            x, reduce_square_sum_func.GetIdentityValue<ComputeDataType>(), reduce_square_sum_func);
+        block_reduce2d_sync(square_sum, reduce_sum_func);
+        block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func);
+
+        // compute inv-rms
+        auto inv_rms = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ / row_size + epsilon));
+            },
+            square_sum);
+
+        if constexpr(kSaveInvRms)
+            store_tile(inv_rms_window, cast_tile<InvRmsDataType>(inv_rms));
+
+        // rmsnorm computation
+        auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
+        sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) {
+            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+            constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+            const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+
+            const auto x_ = type_convert<ComputeDataType>(x[idx]);
+            auto y_       = x_ * inv_rms_[i_idx] * gamma_;
+
+            y(idx) = type_convert<YDataType>(y_);
+        });
+        store_tile(y_window, y);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
new file mode 100644
index 000000000..87cab3463
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType_,
+          typename GammaDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename InvRmsDataType_,
+          typename BlockShape_,
+          bool kPadN_,
+          bool kSaveInvRms_,
+          bool kTwoPass_>
+struct Rmsnorm2dFwdPipelineProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using GammaDataType   = remove_cvref_t<GammaDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using InvRmsDataType  = remove_cvref_t<InvRmsDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool kSaveInvRms = kSaveInvRms_;
+    static constexpr bool kTwoPass    = kTwoPass_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
new file mode 100644
index 000000000..a892df6bd
--- /dev/null
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = Rmsnorm2dFwdPipelineDefaultPolicy>
+struct Rmsnorm2dFwdPipelineTwoPass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+    using InvRmsDataType  = ck_tile::remove_cvref_t<typename Problem::InvRmsDataType>;
+
+    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveInvRms = Problem::kSaveInvRms;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_tp"; // block per row
+        else
+            return "wpr_tp"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow, typename GammaWindow, typename YWindow, typename InvRmsWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const GammaWindow& gamma_window_,
+                                   YWindow& y_window,
+                                   InvRmsWindow& inv_rms_window,
+                                   ComputeDataType epsilon,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto gamma_window = make_tile_window(
+            gamma_window_, Policy::template MakeGammaBlockTileDistribution<Problem>());
+
+        // Problem::BlockShape
+        static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+
+        auto reduce_square_sum_func = ReduceOp::SquareAdd{};
+        auto reduce_sum_func        = ReduceOp::Add{};
+        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(load_tile(x_window));
+        auto square_sum   = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_reduce2d(x, square_sum, reduce_square_sum_func);
+            move_tile_window(x_window, {0, Block_N});
+        }
+
+        block_reduce2d_sync(square_sum, reduce_sum_func);
+        block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func);
+
+        // compute inv-rms
+        auto inv_rms = tile_elementwise_in(
+            [&](const auto& v_) {
+                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ / row_size + epsilon));
+            },
+            square_sum);
+
+        if constexpr(kSaveInvRms)
+            store_tile(inv_rms_window, cast_tile<InvRmsDataType>(inv_rms));
+
+        // reverse read x to reuse cache
+        ck_tile::index_t stride_to_right_most_window =
+            row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
+
+        move_tile_window(x_window, {0, -Block_N});
+        move_tile_window(gamma_window, {stride_to_right_most_window});
+        move_tile_window(y_window, {0, stride_to_right_most_window});
+
+        // rmsnorm computation
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            // load gamma/beta (TODO: support no gamma/beta?)
+            const auto gamma = load_tile(gamma_window);
+
+            auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
+
+            sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) {
+                constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+
+                const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
+
+                const auto x_ = type_convert<ComputeDataType>(x[idx]);
+                auto y_       = x_ * inv_rms_[i_idx] * gamma_;
+
+                y(idx) = type_convert<YDataType>(y_);
+            });
+
+            store_tile(y_window, y);
+
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(gamma_window, {-Block_N});
+            move_tile_window(y_window, {0, -Block_N});
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
index 55d55402d..623e1e16d 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -276,8 +276,8 @@ struct BlockWelfordCrossWarpSync
         fp32x4_t all_scratch[thread_buf_size * num_reduce_warps];
         static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
             static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
-                all_scratch[i_0 * num_warps + i_1] =
-                    smem_ptr[i_0 * num_reduce_warps + local_smem_os + i_1];
+                all_scratch[i_0 * num_reduce_warps + i_1] =
+                    smem_ptr[i_0 * num_warps + local_smem_os + i_1];
             });
         });
         block_sync_lds(); // TODO: we don't need sync here
@@ -286,7 +286,7 @@ struct BlockWelfordCrossWarpSync
 
         static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
             // TODO: use descriptor for this
-            auto v_local       = all_scratch[i_0 * num_warps];
+            auto v_local       = all_scratch[i_0 * num_reduce_warps];
             auto v_local_mean  = bit_cast<DataType>(v_local[0]);
             auto v_local_var   = bit_cast<DataType>(v_local[1]);
             auto v_local_count = bit_cast<int>(v_local[2]);
@@ -294,7 +294,7 @@ struct BlockWelfordCrossWarpSync
             // further reduce mean/var
             static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
                 constexpr auto i_1        = number<i_1_n1 + 1>{};
-                const fp32x4_t v_remote   = all_scratch[i_0 * num_warps + i_1];
+                const fp32x4_t v_remote   = all_scratch[i_0 * num_reduce_warps + i_1];
                 const auto v_remote_mean  = bit_cast<DataType>(v_remote[0]);
                 const auto v_remote_var   = bit_cast<DataType>(v_remote[1]);
                 const auto v_remote_count = bit_cast<int>(v_remote[2]);
-- 
GitLab


From 24d996aae11c45430571ebc1ee428dc67fd2d91b Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Wed, 30 Oct 2024 10:05:15 +0100
Subject: [PATCH 026/153] [CK-Tile] Universal gemm memory bound pipeline
 (#1558)

* CK-Tile GEMM with memory bound pipeline.

* Memory bound gemm pipeline.

* Fix not closed namespace.

* Block gemm mem pipeline draft.

* Do not use ck_tile:: within ck_tile namespace.

* Refactoring & Move Layout info to pipeline problem.

* Get hot loop and TailNum information before lunching kernel.

* Fixes in pipeline.

* Add comment to load_tile_raw and change variable naming style.

* Few small changes & formatting.

* Do not use macro.

* Add gtests.

* Use AccDataType for Output of MFMA instruction.

* Formatting.

* Refactor gemm examples.

* Switch over to current block gemm.

* Use currently available pipeline policy.

* Refactoring and review comment.s

* Fixes after merge.

* Add missing include.

* Add load tile overload which accepts output tensor as parameter.

* This give 8% perf boost at the cost of using more registers.

* Rename example.

* Small changes.

* Fix compilation err and lower K.

* Support different layouts for A/B

* Fix vector size for different layouts.

* Rename Alignment into VectorSize

* Unblock tests.
---
 example/ck_tile/03_gemm/CMakeLists.txt        |   4 +-
 example/ck_tile/03_gemm/gemm_basic.cpp        | 366 ++--------------
 example/ck_tile/03_gemm/gemm_basic.hpp        |  32 +-
 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp | 188 ++++++++
 example/ck_tile/03_gemm/run_gemm_example.inc  | 217 +++++++++
 include/ck_tile/core.hpp                      |   1 +
 include/ck_tile/core/tensor/load_tile.hpp     |  27 +-
 include/ck_tile/core/tensor/tile_window.hpp   |  17 +-
 include/ck_tile/core/utility/literals.hpp     |  22 +
 .../ck_tile/host/reference/reference_gemm.hpp |  60 +--
 include/ck_tile/ops/gemm.hpp                  |   2 +
 .../block/block_gemm_areg_bgmem_creg_v1.hpp   |   2 +-
 .../block/block_gemm_asmem_bsmem_creg_v1.hpp  |  30 +-
 .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp   | 123 +++---
 .../ops/gemm/kernel/gemm_tile_partitioner.hpp |  24 +-
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   | 413 ++++++++++++++++++
 .../gemm_pipeline_ag_bg_cr_scheduler.hpp      |  71 +++
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |  24 +-
 ...ine_agmem_bgmem_creg_v1_default_policy.hpp |  10 +-
 .../gemm_pipeline_agmem_bgmem_creg_v2.hpp     |   6 +-
 .../gemm/pipeline/gemm_pipeline_problem.hpp   |  53 ++-
 .../ops/gemm/pipeline/tile_gemm_traits.hpp    |  16 +-
 .../warp/warp_gemm_attribute_mfma_impl.hpp    |  52 +--
 .../ops/gemm/warp/warp_gemm_dispatcher.hpp    |  58 +--
 test/ck_tile/CMakeLists.txt                   |   1 +
 test/ck_tile/gemm/CMakeLists.txt              |   4 +
 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp  |  29 ++
 .../gemm/test_gemm_mem_pipeline_ut_cases.inc  |  41 ++
 .../gemm/test_gemm_mem_pipeline_util.hpp      | 318 ++++++++++++++
 29 files changed, 1655 insertions(+), 556 deletions(-)
 create mode 100644 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
 create mode 100644 example/ck_tile/03_gemm/run_gemm_example.inc
 create mode 100644 include/ck_tile/core/utility/literals.hpp
 create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
 create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
 create mode 100644 test/ck_tile/gemm/CMakeLists.txt
 create mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
 create mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
 create mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp

diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index 03fc9c7eb..8ae46cadc 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,2 +1,2 @@
-set(CMAKE_BUILD_TYPE Debug)
-add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
\ No newline at end of file
+add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
+add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp)
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index 569afed25..09427217c 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "gemm_basic.hpp"
 #include <hip/hip_runtime.h>
 
 #include <cstring>
@@ -10,51 +9,48 @@
 #include <string>
 #include <tuple>
 
-auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("b", "1", "batch size")
-        .insert("m", "1024", "m dimension")
-        .insert("n", "2048", "n dimension")
-        .insert("k", "64", "k dimension")
-        .insert("stride_a", "0", "Tensor A stride")
-        .insert("stride_b", "0", "Tensor B stride")
-        .insert("stride_c", "0", "Tensor C stride")
-        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
-        .insert("e", "1e-5", "Absolute error tolerance")
-        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
-        .insert("warmup", "10", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_basic.hpp"
 
-template <typename LayoutA,
-          typename LayoutB,
-          typename LayoutC,
-          typename PipelineProblem,
-          typename GemmPipeline,
-          typename GemmShape>
+template <typename ALayout, typename BLayout, typename CLayout>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
     // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
     constexpr bool kPadA        = true;
     constexpr bool kPadB        = true;
+    constexpr bool kPadC        = true;
     constexpr bool kTilePermute = false;
+    // The rank and permutation will also be generate out by the CodeGen part.
+    constexpr ck_tile::index_t kOutputRank = 2;
 
     constexpr int kBlockPerCu = 1;
 
-    using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
 
-    // The rank and permutation will also be generate out by the CodeGen part.
-    constexpr ck_tile::index_t kOutputRank = 2;
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
 
     // Whether doing the CShuffle (transpose before the global memory), depending on the output
     // layout.
     constexpr bool CShuffleEpilogue =
-        std::is_same_v<LayoutC, ck_tile::tensor_layout::gemm::ColumnMajor>;
+        std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTilePartitioner<CodegenGemmShape>;
 
     using GemmEpilogue = std::conditional_t<
         CShuffleEpilogue,
@@ -70,14 +66,21 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
                                                                    TilePartitioner::kN>>,
         ck_tile::Default2DEpilogue<
             ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+    using CodegenPipelineProblem = ck_tile::
+        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
+    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy<ALayout, BLayout, CLayout>;
+    using CodegenGemmPipeline =
+        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
     // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-    using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+    using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
 
     auto kargs = Kernel::MakeKargs(args.p_a,
                                    args.p_b,
                                    args.p_c,
-                                   args.epsilon,
                                    args.M,
                                    args.N,
                                    args.K,
@@ -88,299 +91,20 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
     constexpr dim3 blocks = Kernel::BlockSize();
 
-    float ave_time = ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-    return ave_time;
-}
-
-template <typename DataType,
-          typename LayoutA,
-          typename LayoutB,
-          typename LayoutC,
-          typename PipelineProblem,
-          typename GemmPipeline,
-          typename GemmShape>
-float invoke_gemm(ck_tile::DeviceMem& a_buf,
-                  ck_tile::DeviceMem& b_buf,
-                  ck_tile::DeviceMem& c_buf,
-                  const ck_tile::ArgParser& arg_parser)
-{
-
-    std::string data_type = arg_parser.get_str("prec");
-
-    if(data_type != DataTypeTraits<DataType>::name)
-    {
-        std::cerr << "Data type mismatch: expected " << DataTypeTraits<DataType>::name << ", got "
-                  << data_type << std::endl;
-        return -1; // Or handle the error appropriately
-    }
-
-    float epsilon               = arg_parser.get_float("e");
-    ck_tile::index_t batch_size = arg_parser.get_int("b");
-    ck_tile::index_t M          = arg_parser.get_int("m");
-    ck_tile::index_t N          = arg_parser.get_int("n");
-    ck_tile::index_t K          = arg_parser.get_int("k");
-
-    ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
-    ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
-    ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
-
-    gemm_basic_args args;
-    args.p_a     = a_buf.GetDeviceBuffer();
-    args.p_b     = b_buf.GetDeviceBuffer();
-    args.p_c     = c_buf.GetDeviceBuffer();
-    args.epsilon = epsilon;
-    args.kbatch  = batch_size;
-    args.M       = M;
-    args.N       = N;
-    args.K       = K;
-
-    // Only set stride_M and stride_N if they are non-zero and not equal to K.
-    if(stride_a != 0)
-    {
-        args.stride_A = stride_a;
-    }
-    else
-    {
-        args.stride_A = [&]() {
-            if constexpr(std::is_same_v<LayoutA, ck_tile::tensor_layout::gemm::ColumnMajor>)
-            {
-                return M;
-            }
-            else
-            {
-                return K;
-            }
-        }();
-    }
-
-    if(stride_b != 0)
-    {
-        args.stride_B = stride_b;
-    }
-    else
+    if(s.log_level_ > 0)
     {
-        args.stride_B = [&]() {
-            if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return N;
-            }
-            else
-            {
-                return K;
-            }
-        }();
+        std::cout << "Launching kernel with args:"
+                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
     }
 
-    if(stride_c != 0)
-    {
-        args.stride_C = stride_c;
-    }
-    else
-    {
-        args.stride_C = [&]() {
-            if constexpr(std::is_same_v<LayoutC, ck_tile::tensor_layout::gemm::ColumnMajor>)
-            {
-                return M;
-            }
-            else
-            {
-                return N;
-            }
-        }();
-    }
-
-    float ave_time = gemm_calc<LayoutA, LayoutB, LayoutC, PipelineProblem, GemmPipeline, GemmShape>(
-        args, ck_tile::stream_config{nullptr, true});
-    std::size_t num_byte =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-
-    std::cout << "The overall perfomance of the GEMM with "
-              << "[" << data_type << "]"
-              << "batch size: " << batch_size << ". m:" << M << ", n:" << N << ", k:" << K
-              << " is: \n";
-    std::cout << "Running time: " << ave_time << "ms, Throughput " << gb_per_sec << "GB/s \n"
-              << std::flush;
+    float ave_time = ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
     return ave_time;
 }
 
-int main(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    ck_tile::index_t M = arg_parser.get_int("m");
-    ck_tile::index_t N = arg_parser.get_int("n");
-    ck_tile::index_t K = arg_parser.get_int("k");
-
-    // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
-    using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor;
-    using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor;
-    using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor;
-
-    // host verify
-    std::vector<int> a_dimensions =
-        (std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::RowMajor>)
-            ? std::vector<int>{M, K}
-            : std::vector<int>{K, M};
-    std::vector<int> b_dimensions =
-        (std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
-            ? std::vector<int>{N, K}
-            : std::vector<int>{K, N};
-    std::vector<int> c_dimensions =
-        (std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::RowMajor>)
-            ? std::vector<int>{M, N}
-            : std::vector<int>{N, M};
-
-    ck_tile::HostTensor<ADataType> a_host(a_dimensions);
-    ck_tile::HostTensor<BDataType> b_host(b_dimensions);
-
-    ck_tile::HostTensor<CDataType> c_host_ref(c_dimensions);
-    ck_tile::HostTensor<CDataType> c_host_dev(c_dimensions);
-
-    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
-    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_host);
-
-    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem c_buf(c_host_dev.get_element_space_size_in_bytes());
-
-    a_buf.ToDevice(a_host.data());
-    b_buf.ToDevice(b_host.data());
-
-    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadA = true;
-    constexpr bool kPadB = true;
-    constexpr bool kPadC = true;
-
-    // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-
-    using CodegenGemmTraits = ck_tile::
-        TileGemmTraits<kPadA, kPadB, kPadC, matrix_a_layout, matrix_b_layout, matrix_c_layout>;
-
-    using CodegenPipelineProblem = ck_tile::
-        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-
-    using CodegenGemmPolicy = ck_tile::
-        UniversalGemmPipelineAgBgCrPolicy<matrix_a_layout, matrix_b_layout, matrix_c_layout>;
-
-    using CodegenGemmPipeline =
-        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
-
-    invoke_gemm<ck_tile::half_t,
-                matrix_a_layout,
-                matrix_b_layout,
-                matrix_c_layout,
-                CodegenPipelineProblem,
-                CodegenGemmPipeline,
-                CodegenGemmShape>(a_buf, b_buf, c_buf, arg_parser);
-
-    c_buf.FromDevice(c_host_dev.data());
-
-    bool pass_cpu = true;
-
-    if(arg_parser.get_int("v") == 1)
-    {
-        // ToDo: Will Add the Element Op (bias) verification in the future.
-        ck_tile::reference_gemm<ADataType,
-                                BDataType,
-                                AccDataType,
-                                CDataType,
-                                matrix_a_layout,
-                                matrix_b_layout,
-                                matrix_c_layout>(a_host, b_host, c_host_ref);
-
-        pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref);
-
-        std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail")
-                  << std::flush;
-    }
-
-    bool pass_gpu = true;
-
-    if(arg_parser.get_int("v") == 2)
-    {
-        ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
-        ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
-        ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
-
-        if(stride_a == 0)
-        {
-            if constexpr(std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
-            {
-                stride_a = M;
-            }
-            else
-            {
-                stride_a = K;
-            }
-        }
-
-        if(stride_b == 0)
-        {
-            if constexpr(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                stride_b = N;
-            }
-            else
-            {
-                stride_b = K;
-            }
-        }
-
-        if(stride_c == 0)
-        {
-            if constexpr(std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
-            {
-                stride_c = M;
-            }
-            else
-            {
-                stride_c = N;
-            }
-        }
-
-        ck_tile::HostTensor<CDataType> c_host_gpu_ref(c_dimensions);
-        ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes());
+#include "run_gemm_example.inc"
 
-        ck_tile::reference_gemm_gpu<ADataType,
-                                    BDataType,
-                                    AccDataType,
-                                    CDataType,
-                                    matrix_a_layout,
-                                    matrix_b_layout,
-                                    matrix_c_layout>(
-            a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c);
-
-        c_buf.FromDevice(c_host_gpu_ref.data());
-
-        pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref);
-
-        std::cout << "The GPU veification result is: " << (pass_gpu ? "correct" : "fail")
-                  << std::flush;
-    }
-
-    std::cout << std::endl << std::flush;
-
-    return !pass_gpu;
-}
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp
index ce2e0f706..23e99bc2a 100644
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -4,12 +4,10 @@
 
 #pragma once
 
+#include <string>
+
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/host.hpp"
-#include <string>
 
 template <typename DataType>
 struct GemmBasicTypeConfig;
@@ -20,7 +18,7 @@ struct GemmBasicTypeConfig<ck_tile::half_t>
     using ADataType   = ck_tile::half_t;
     using BDataType   = ck_tile::half_t;
     using AccDataType = float;
-    using CDataType   = ck_tile::half_t; // type convert
+    using CDataType   = ck_tile::half_t;
     // ToDo: Add more bias config to support different categories of GEMM.
 };
 
@@ -58,7 +56,6 @@ struct gemm_basic_args
     const void* p_a;
     const void* p_b;
     void* p_c;
-    float epsilon;
     ck_tile::index_t kbatch;
     ck_tile::index_t M;
     ck_tile::index_t N;
@@ -68,5 +65,28 @@ struct gemm_basic_args
     ck_tile::index_t stride_C;
 };
 
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("b", "1", "batch size")
+        .insert("m", "3840", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("k", "2048", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "R", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
 // host API
 float gemm_calc(gemm_basic_args args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
new file mode 100644
index 000000000..2ee0395e4
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_basic.hpp"
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
+{
+    // ToDo: This will be modified by the codegen code later.
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadA = true;
+    constexpr bool kPadB = true;
+    constexpr bool kPadC = true;
+
+    constexpr int kBlockPerCu = 1;
+
+    // ===============================================
+
+    using GemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
+
+    using GemmEpilogue = ck_tile::Default2DEpilogue<
+        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, false, kPadC>>;
+
+    using Traits = ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+
+    using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
+
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+
+        using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
+            ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                  BDataType,
+                                                  AccDataType,
+                                                  GemmShape,
+                                                  Traits,
+                                                  ck_tile::GemmPipelineScheduler::Intrawave,
+                                                  has_hot_loop_v,
+                                                  tail_number_v>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKargs(args.p_a,
+                                       args.p_b,
+                                       args.p_c,
+                                       args.M,
+                                       args.N,
+                                       args.K,
+                                       args.stride_A,
+                                       args.stride_B,
+                                       args.stride_C);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args:"
+                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        return ave_time;
+    };
+
+    if(has_hot_loop)
+    {
+        // Tail pipeline One to Seven
+        if(tail_num == ck_tile::TailNumber::One)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+
+        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
+        {
+            if(tail_num == ck_tile::TailNumber::Two)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
+        {
+            if(tail_num == ck_tile::TailNumber::Three)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
+        {
+            if(tail_num == ck_tile::TailNumber::Four)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
+        {
+            if(tail_num == ck_tile::TailNumber::Five)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
+        {
+            if(tail_num == ck_tile::TailNumber::Six)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
+        {
+            if(tail_num == ck_tile::TailNumber::Seven)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
+            }
+        }
+    }
+    else
+    {
+        // Tail number always Full - #PrefetchStages
+        if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "When there's no hot loop, this tail number \"" << tail_num
+                << "\" is not supported! " << __FILE__ << ":" << __LINE__
+                << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
+        }
+    }
+
+    return ave_time;
+}
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
new file mode 100644
index 000000000..8db131738
--- /dev/null
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                  ck_tile::DeviceMem& b_k_n_dev_buf,
+                  ck_tile::DeviceMem& c_m_n_dev_buf,
+                  ck_tile::index_t M,
+                  ck_tile::index_t N,
+                  ck_tile::index_t K,
+                  ck_tile::index_t stride_A,
+                  ck_tile::index_t stride_B,
+                  ck_tile::index_t stride_C,
+                  ck_tile::index_t kbatch,
+                  int n_warmup,
+                  int n_repeat)
+{
+    gemm_basic_args args;
+    args.p_a      = a_m_k_dev_buf.GetDeviceBuffer();
+    args.p_b      = b_k_n_dev_buf.GetDeviceBuffer();
+    args.p_c      = c_m_n_dev_buf.GetDeviceBuffer();
+    args.kbatch   = kbatch;
+    args.M        = M;
+    args.N        = N;
+    args.K        = K;
+    args.stride_A = stride_A;
+    args.stride_B = stride_B;
+    args.stride_C = stride_C;
+
+    float ave_time = gemm_calc<ALayout, BLayout, CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::string op_name{"Gemm{MemBoundPipeline}"};
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_gemm_example_with_layouts(int argc,
+                                  char* argv[],
+                                  const ALayout a_layout                  = ALayout{},
+                                  const BLayout b_layout                  = BLayout{},
+                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t batch_size = arg_parser.get_int("b");
+    int n_warmup                = arg_parser.get_int("warmup");
+    int n_repeat                = arg_parser.get_int("repeat");
+
+    using namespace ck_tile::literals;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride = [](std::size_t row,
+                                   std::size_t col,
+                                   std::size_t stride,
+                                   auto layout) {
+        if(stride == 0)
+        {
+            // give a chance if stride is zero, return a default packed stride
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return col;
+            }
+            else
+            {
+                return row;
+            }
+        }
+        else
+            return stride;
+    };
+
+    stride_A = f_get_default_stride(M, K, stride_A, a_layout);
+    stride_B = f_get_default_stride(K, N, stride_B, b_layout);
+    stride_C = f_get_default_stride(M, N, stride_C, CLayout{});
+
+    ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, a_layout));
+    ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, b_layout));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+
+    // TODO: add different init types
+
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    invoke_gemm<ALayout, BLayout, CLayout>(a_m_k_dev_buf,
+                                           b_k_n_dev_buf,
+                                           c_m_n_dev_buf,
+                                           M,
+                                           N,
+                                           K,
+                                           stride_A,
+                                           stride_B,
+                                           stride_C,
+                                           batch_size,
+                                           n_warmup,
+                                           n_repeat);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(
+            a_m_k_dev_buf, b_k_n_dev_buf, c_m_n_gpu_buf_ref, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref);
+
+        std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    std::string a_layout = arg_parser.get_str("a_layout");
+    std::string b_layout = arg_parser.get_str("b_layout");
+
+    if(a_layout == "R" && b_layout == "R")
+    {
+        return run_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "C")
+    {
+        return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "C")
+    {
+        return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "R")
+    {
+        return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index fa4b8d3cc..2c423831e 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -56,6 +56,7 @@
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/functional_with_tuple.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/utility/literals.hpp"
 #include "ck_tile/core/utility/magic_div.hpp"
 #include "ck_tile/core/utility/philox_rand.hpp"
 #include "ck_tile/core/utility/random.hpp"
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index 06b5a8da0..f150fc54c 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -46,6 +46,31 @@ CK_TILE_DEVICE auto load_tile(const tile_window_linear<BottomTensorView_,
     return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
 }
 
+template <typename DistributedTensor_,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          index_t NumCoord,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
+                              const tile_window_with_static_distribution<BottomTensorView_,
+                                                                         WindowLengths_,
+                                                                         TileDistribution_,
+                                                                         NumCoord>& tile_window,
+                              bool_constant<oob_conditional_check> = {})
+{
+    return tile_window.load(dst_tile, bool_constant<oob_conditional_check>{});
+}
+
+/**
+ * @brief Loads a tile of data using inline assembly.
+ *
+ * @note Bare in mind that loading data this way, you have to manually initialize your
+ *       thread buffer and synchronize load afterwards in order to make sure it's done before
+ *       using loaded data from registers
+ *       @see `tile_window_with_static_distribution::init_raw()` and `buffer_view.hpp`
+ *       @see  `buffer_load_fence()`
+ */
 template <typename T,
           typename BottomTensorView_,
           typename WindowLengths_,
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index ca3507827..e41024698 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -290,15 +290,22 @@ struct tile_window_with_static_distribution
     CK_TILE_DEVICE auto load(number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
     {
-        using Traits = load_store_traits;
+        constexpr auto tile_dstr = TileDstr{};
+        auto dst_tensor          = make_static_distributed_tensor<DataType>(tile_dstr);
+        load(dst_tensor, bool_constant<oob_conditional_check>{});
+        return dst_tensor;
+    }
 
+    template <typename DistributedTensor, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+                             bool_constant<oob_conditional_check> = {}) const
+    {
+        using Traits   = load_store_traits;
         using vector_t = typename Traits::vector_t;
         using SFC_Ys   = typename Traits::SFC_Ys;
 
         constexpr auto tile_dstr = TileDstr{};
 
-        auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
-
         // loop over thread tensor space [y0, y1, ...]
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
             /// TODO: use structure binding (to be captured later) if compiled in C++20
@@ -353,8 +360,6 @@ struct tile_window_with_static_distribution
                 }
             });
         });
-
-        return dst_tensor;
     }
 
     template <typename DstTile,
diff --git a/include/ck_tile/core/utility/literals.hpp b/include/ck_tile/core/utility/literals.hpp
new file mode 100644
index 000000000..6f64f09f4
--- /dev/null
+++ b/include/ck_tile/core/utility/literals.hpp
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+namespace ck_tile {
+namespace literals {
+// [P0330] Literal Suffix for (signed) size_t (C++23)
+// ref: https://wg21.link/p0330r8
+inline constexpr std::size_t operator""_uz(unsigned long long size)
+{
+    return static_cast<std::size_t>(size);
+}
+
+inline constexpr std::size_t operator""_zu(unsigned long long size)
+{
+    return static_cast<std::size_t>(size);
+}
+} // namespace literals
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index a496c91e0..dbdef0e9c 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -1,12 +1,13 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include <cstdlib>
+#include <thread>
+
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/host_tensor.hpp"
-#include "ck_tile/ops/common/tensor_layout.hpp"
-#include <thread>
 
 namespace ck_tile {
 
@@ -14,55 +15,36 @@ template <typename ADataType,
           typename BDataType,
           typename AccDataType,
           typename CDataType,
-          typename LayoutA,
-          typename LayoutB,
-          typename LayoutC,
           typename AElementOp   = ck_tile::identity,
           typename BElementOp   = ck_tile::identity,
           typename ACCElementOp = ck_tile::identity>
 CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
-                                 const HostTensor<BDataType>& b_n_k,
+                                 const HostTensor<BDataType>& b_k_n,
                                  HostTensor<CDataType>& c_m_n,
                                  const AElementOp& a_element_op     = {},
                                  const BElementOp& b_element_op     = {},
                                  const ACCElementOp& acc_element_op = {})
 {
-    const int N = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
-                      ? b_n_k.mDesc.get_lengths()[0]
-                      : b_n_k.mDesc.get_lengths()[1];
-    const int K = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
-                      ? a_m_k.mDesc.get_lengths()[1]
-                      : a_m_k.mDesc.get_lengths()[0];
-    const int M = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
-                      ? a_m_k.mDesc.get_lengths()[0]
-                      : a_m_k.mDesc.get_lengths()[1];
-
-    auto f = [&](auto m) {
-        for(int n = 0; n < N; ++n)
+    const std::size_t M = a_m_k.get_length(0);
+    const std::size_t N = b_k_n.get_length(1);
+    const std::size_t K = a_m_k.get_length(1);
+
+    auto f_mn = [&](auto m, auto n) {
+        AccDataType v_acc = 0;
+
+        for(std::size_t k = 0; k < K; ++k)
         {
-            AccDataType v_acc = 0;
-
-            for(int k = 0; k < K; ++k)
-            {
-                ADataType v_a = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
-                                    ? a_element_op(a_m_k(m, k))
-                                    : a_element_op(a_m_k(k, m));
-                BDataType v_b = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
-                                    ? b_element_op(b_n_k(n, k))
-                                    : b_element_op(b_n_k(k, n));
-
-                v_acc += ck_tile::type_convert<AccDataType>(v_a) *
-                         ck_tile::type_convert<AccDataType>(v_b);
-            }
-
-            CDataType& c_ref = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
-                                   ? c_m_n(m, n)
-                                   : c_m_n(n, m);
-            c_ref            = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
+            ADataType v_a = a_element_op(a_m_k(m, k));
+            BDataType v_b = b_element_op(b_k_n(k, n));
+
+            v_acc +=
+                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
         }
+
+        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
     };
 
-    make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
 }
 
 template <typename ADataType,
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 4ca773479..c3e028528 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -24,6 +24,8 @@
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
index 8d9e24638..d50179c1a 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
@@ -32,7 +32,7 @@ struct BlockGemmARegBGmemCRegV1
         BlockGemmProblem<ADataType, BDataType, CDataType, kBlockSize, BlockGemmShape>,
         BlockGemmARegBGmemCRegV1DefaultPolicy>;
 
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
     {
         return sizeof(BDataType) *
                Policy::template MakeBSmemBlockDescriptor<Problem>().get_element_space_size();
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
index dc0b41135..d6fee879b 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
@@ -24,19 +24,19 @@ struct BlockGemmASmemBSmemCRegV1
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
     // C += A * B
-    template <typename CBlockTensor, typename ABlockWindowTmp, typename BBlockWindowTmp>
+    template <typename CBlockTensor, typename ABlockWindow, typename BBlockWindow>
     CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
-                                   const ABlockWindowTmp& a_block_window_tmp,
-                                   const BBlockWindowTmp& b_block_window_tmp) const
+                                   const ABlockWindow& a_block_window,
+                                   const BBlockWindow& b_block_window) const
     {
-        static_assert(std::is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
-                          std::is_same_v<BDataType, typename BBlockWindowTmp::DataType> &&
+        static_assert(std::is_same_v<ADataType, typename ABlockWindow::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindow::DataType> &&
                           std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                       "wrong!");
 
-        constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}];
-        constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
-        constexpr index_t KPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<1>{}];
+        constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t NPerBlock = BBlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}];
 
         static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN &&
                           KPerBlock == BlockGemmShape::kK,
@@ -62,9 +62,9 @@ struct BlockGemmASmemBSmemCRegV1
 
         // construct A-warp-window
         auto a_warp_window_tmp = make_tile_window(
-            a_block_window_tmp.get_bottom_tensor_view(),
+            a_block_window.get_bottom_tensor_view(),
             make_tuple(number<WG::kM>{}, number<WG::kK>{}),
-            a_block_window_tmp.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
             make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
 
 #if 0 // FIXME: using array will cause register spill
@@ -97,9 +97,9 @@ struct BlockGemmASmemBSmemCRegV1
 
         // construct B-warp-window
         auto b_warp_window_tmp = make_tile_window(
-            b_block_window_tmp.get_bottom_tensor_view(),
+            b_block_window.get_bottom_tensor_view(),
             make_tuple(number<WG::kN>{}, number<WG::kK>{}),
-            b_block_window_tmp.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0},
+            b_block_window.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0},
             make_static_tile_distribution(typename WG::BWarpDstrEncoding{}));
 
 #if 0 // FIXME: using array will cause register spill
@@ -200,12 +200,12 @@ struct BlockGemmASmemBSmemCRegV1
     }
 
     // C = A * B
-    template <typename ABlockTensorTmp, typename BBlockWindowTmp>
+    template <typename ABlockTensorTmp, typename BBlockWindow>
     CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp,
-                                   const BBlockWindowTmp& b_block_window_tmp) const
+                                   const BBlockWindow& b_block_window) const
     {
         auto c_block_tensor = MakeCBlockTile();
-        operator()(c_block_tensor, a_block_tensor_tmp, b_block_window_tmp);
+        operator()(c_block_tensor, a_block_tensor_tmp, b_block_window);
         return c_block_tensor;
     }
 };
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 48329c8ba..1671ddad3 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -3,12 +3,13 @@
 
 #pragma once
 
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
 #include <iostream>
-
 #include <string>
 
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+
 namespace ck_tile {
 
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
@@ -17,20 +18,19 @@ struct GemmKernel
     using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
     using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
     using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
-    static constexpr index_t KernelBlockSize = GemmPipeline::kBlockSize;
-
-    using ADataType    = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using BDataType    = remove_cvref_t<typename GemmPipeline::BDataType>;
-    using CAccDataType = remove_cvref_t<typename GemmPipeline::CDataType>;
-    using CODataType   = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
 
-    using LayoutA = remove_cvref_t<typename GemmPipeline::LayoutA>;
-    using LayoutB = remove_cvref_t<typename GemmPipeline::LayoutB>;
-    using LayoutC = remove_cvref_t<typename GemmPipeline::LayoutC>;
+    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    // using CAccDataType = remove_cvref_t<typename GemmPipeline::CDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
-    __host__ static constexpr auto GridSize(index_t M_size, index_t N_size, index_t Batch_size)
+    __host__ static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
     {
-        return TilePartitioner::GridSize(M_size, N_size, Batch_size);
+        return TilePartitioner::GridSize(M, N, KBatch);
     }
 
     __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
@@ -40,34 +40,30 @@ struct GemmKernel
         const void* a_ptr;
         const void* b_ptr;
         void* c_ptr;
-
-        float epsilon;
-
-        ck_tile::index_t M;
-        ck_tile::index_t N;
-        ck_tile::index_t K;
-        ck_tile::index_t stride_A;
-        ck_tile::index_t stride_B;
-        ck_tile::index_t stride_C;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t stride_A;
+        index_t stride_B;
+        index_t stride_C;
     };
 
     CK_TILE_HOST static constexpr GemmCommonKargs MakeKargs(const void* a_ptr,
                                                             const void* b_ptr,
                                                             void* c_ptr,
-                                                            float epsilon,
-                                                            ck_tile::index_t M,
-                                                            ck_tile::index_t N,
-                                                            ck_tile::index_t K,
-                                                            ck_tile::index_t stride_A,
-                                                            ck_tile::index_t stride_B,
-                                                            ck_tile::index_t stride_C)
+                                                            index_t M,
+                                                            index_t N,
+                                                            index_t K,
+                                                            index_t stride_A,
+                                                            index_t stride_B,
+                                                            index_t stride_C)
     {
-        return GemmCommonKargs{a_ptr, b_ptr, c_ptr, epsilon, M, N, K, stride_A, stride_B, stride_C};
+        return GemmCommonKargs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C};
     }
 
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
-        return ck_tile::max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
     CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
@@ -78,13 +74,13 @@ struct GemmKernel
         const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
         // Convert pointers to tensor views
         auto a_tensor_view = [&]() {
-            if constexpr(std::is_same_v<LayoutA, tensor_layout::gemm::ColumnMajor>)
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     a_start,
                     make_tuple(kargs.M, kargs.K),
-                    make_tuple(1, kargs.stride_A),
-                    number<GemmPipeline::AlignmentA>{},
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::VectorSizeA>{},
                     number<1>{});
             }
             else
@@ -92,29 +88,29 @@ struct GemmKernel
                 return make_naive_tensor_view<address_space_enum::global>(
                     a_start,
                     make_tuple(kargs.M, kargs.K),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::AlignmentA>{},
+                    make_tuple(1, kargs.stride_A),
+                    number<1>{},
                     number<1>{});
             }
         }();
 
         auto b_tensor_view = [&]() {
-            if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     b_start,
                     make_tuple(kargs.N, kargs.K),
                     make_tuple(1, kargs.stride_B),
-                    number<GemmPipeline::AlignmentB>{},
+                    number<1>{},
                     number<1>{});
             }
             else
-            { // Default NK layout
+            {
                 return make_naive_tensor_view<address_space_enum::global>(
                     b_start,
                     make_tuple(kargs.N, kargs.K),
                     make_tuple(kargs.stride_B, 1),
-                    number<GemmPipeline::AlignmentB>{},
+                    number<GemmPipeline::VectorSizeB>{},
                     number<1>{});
             }
         }();
@@ -122,10 +118,12 @@ struct GemmKernel
         auto a_pad_view = pad_tensor_view(
             a_tensor_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-            sequence < 0,
-            GemmPipeline::kPadA ? 1 : 0 > {});
+            // somehow clang-format is splitting below line into multiple.
+            // clang-format off
+            sequence<false, GemmPipeline::kPadA>{});
+        // clang-format on
 
-        auto ABlockWindow = make_tile_window(
+        auto a_block_window = make_tile_window(
             a_pad_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
             {i_m, 0});
@@ -133,10 +131,11 @@ struct GemmKernel
         auto b_pad_view = pad_tensor_view(
             b_tensor_view,
             make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-            sequence < 0,
-            GemmPipeline::kPadB ? 1 : 0 > {});
+            // clang-format off
+            sequence<false, GemmPipeline::kPadB>{});
+        // clang-format on
 
-        auto BBlockWindow = make_tile_window(
+        auto b_block_window = make_tile_window(
             b_pad_view,
             make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
             {i_n, 0});
@@ -144,20 +143,21 @@ struct GemmKernel
         // allocate LDS
         __shared__ char smem_ptr[GetSmemSize()];
 
-        const index_t num_loop = (kargs.K + TilePartitioner::kK - 1) / TilePartitioner::kK;
-
-        auto acc = GemmPipeline{}(ABlockWindow, BBlockWindow, num_loop, smem_ptr);
+        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
 
-        CODataType* c_start = static_cast<CODataType*>(kargs.c_ptr);
+        // Run GEMM cooperatively by whole wokrgroup.
+        auto c_block_tile =
+            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
 
+        CDataType* c_start = static_cast<CDataType*>(kargs.c_ptr);
         auto c_tensor_view = [&]() {
-            if constexpr(std::is_same_v<LayoutC, tensor_layout::gemm::ColumnMajor>)
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     c_start,
                     make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_C),
-                    number<GemmPipeline::AlignmentC>{},
+                    make_tuple(kargs.stride_C, 1),
+                    number<GemmPipeline::VectorSizeC>{},
                     number<1>{});
             }
             else
@@ -165,8 +165,8 @@ struct GemmKernel
                 return make_naive_tensor_view<address_space_enum::global>(
                     c_start,
                     make_tuple(kargs.M, kargs.N),
-                    make_tuple(kargs.stride_C, 1),
-                    number<GemmPipeline::AlignmentC>{},
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
                     number<1>{});
             }
         }();
@@ -174,14 +174,15 @@ struct GemmKernel
         auto c_pad_view = pad_tensor_view(
             c_tensor_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
-            sequence < 0,
-            GemmPipeline::kPadC ? 1 : 0 > {});
-        auto CBlockWindow_pad = make_tile_window(
+            // clang-format off
+            sequence<false, GemmPipeline::kPadC>{});
+        // clang-format on
+        auto c_block_window = make_tile_window(
             c_pad_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
             {i_m, i_n});
 
-        EpiloguePipeline{}(CBlockWindow_pad, acc);
+        EpiloguePipeline{}(c_block_window, c_block_tile);
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index a49ffc291..6387233c0 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -9,26 +9,30 @@ namespace ck_tile {
 template <typename BlockGemmShape_>
 struct GemmTilePartitioner
 {
-    using BlockGemmShape = ck_tile::remove_cvref_t<BlockGemmShape_>;
+    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
 
-    static constexpr ck_tile::index_t kM = BlockGemmShape::kM;
-    static constexpr ck_tile::index_t kN = BlockGemmShape::kN;
-    static constexpr ck_tile::index_t kK = BlockGemmShape::kK;
+    static constexpr index_t kM = BlockGemmShape::kM;
+    static constexpr index_t kN = BlockGemmShape::kN;
+    static constexpr index_t kK = BlockGemmShape::kK;
 
-    CK_TILE_HOST static constexpr auto
-    GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size)
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t batch_size)
     {
-        ck_tile::index_t GridDimX = (M + kM - 1) / kM;
-        ck_tile::index_t GridDimY = (N + kN - 1) / kN;
-        ck_tile::index_t GridDimZ = batch_size;
+        index_t GridDimX = (M + kM - 1) / kM;
+        index_t GridDimY = (N + kN - 1) / kN;
+        index_t GridDimZ = batch_size;
         return dim3(GridDimX, GridDimY, GridDimZ);
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K)
+    {
+        return integer_divide_ceil(K, kK);
+    }
+
     CK_TILE_DEVICE auto operator()()
     {
         const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM);
         const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN);
-        return ck_tile::make_tuple(iM, iN);
+        return make_tuple(iM, iN);
     }
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
new file mode 100644
index 000000000..b9b45d3f4
--- /dev/null
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+
+namespace ck_tile {
+
+//  A Tile Window: global memory
+//  B Tile Window: global memory
+//  C Distributed tensor: register
+template <typename Problem>
+struct BaseGemmPipelineAgBgCrMem
+{
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    // TODO: Is this 32K value gfx9 arch specific?
+    static constexpr index_t MinMemInFlyBytes = 32768;
+
+    static constexpr index_t WgpPerCU =
+        (4 * get_warp_size() / BlockSize) >= 1 ? 4 * get_warp_size() / BlockSize : 1;
+    static constexpr index_t FullMemBandPrefetchStages = integer_divide_ceil(
+        MinMemInFlyBytes / WgpPerCU,
+        (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
+    static constexpr index_t PrefetchStages =
+        FullMemBandPrefetchStages >= 2
+            ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
+            : 2;
+
+    static constexpr index_t LocalPrefillStages = 1;
+    static constexpr index_t GlobalBufferNum    = PrefetchStages;
+
+    CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
+    {
+        if(num_loop % PrefetchStages == 1)
+        {
+            return TailNumber::One;
+        }
+        else if(num_loop % PrefetchStages == 2)
+        {
+            return TailNumber::Two;
+        }
+        else if(num_loop % PrefetchStages == 3)
+        {
+            return TailNumber::Three;
+        }
+        else if(num_loop % PrefetchStages == 4)
+        {
+            return TailNumber::Four;
+        }
+        else if(num_loop % PrefetchStages == 5)
+        {
+            return TailNumber::Five;
+        }
+        else if(num_loop % PrefetchStages == 6)
+        {
+            return TailNumber::Six;
+        }
+        else if(num_loop % PrefetchStages == 7)
+        {
+            return TailNumber::Seven;
+        }
+        else
+        {
+            return TailNumber::Full;
+        }
+    }
+};
+
+// Maximum Global Memory throughput pipeline with >=32KB data in fly
+// GlobalPrefetchStages: >=2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 0
+// LocalSharedMemoryBuffer: 1
+template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy>
+struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
+{
+    using Base = BaseGemmPipelineAgBgCrMem<Problem>;
+
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    using ALayout = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout = remove_cvref_t<typename Problem::BLayout>;
+    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+
+    using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
+    using I0        = number<0>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    static constexpr index_t VectorSizeA = Problem::VectorSizeA;
+    static constexpr index_t VectorSizeB = Problem::VectorSizeB;
+    static constexpr index_t VectorSizeC = Problem::VectorSizeC;
+
+    static constexpr bool kPadA = Problem::kPadA;
+    static constexpr bool kPadB = Problem::kPadB;
+    static constexpr bool kPadC = Problem::kPadC;
+
+    // Where is the right place for HasHotLoop and TailNum ???
+    static constexpr bool HasHotLoop = Problem::HasHotLoop;
+    static constexpr auto TailNum    = Problem::TailNum;
+    static constexpr auto Scheduler  = Problem::Scheduler;
+
+    using Base::PrefetchStages;
+
+    CK_TILE_HOST_DEVICE constexpr index_t GetStaticLdsSize()
+    {
+        return integer_divide_ceil(
+                   sizeof(ADataType) *
+                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
+                   16) *
+                   16 +
+               sizeof(BDataType) *
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <GemmPipelineScheduler Scheduler>
+    struct PipelineImpl
+    {
+    };
+
+    template <>
+    struct PipelineImpl<GemmPipelineScheduler::Intrawave>
+    {
+        template <typename DstBlockTile, typename SrcTileWindow>
+        CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
+                                           SrcTileWindow& dram_tile_window) const
+        {
+            load_tile(dst_block_tile, dram_tile_window);
+            move_tile_window(dram_tile_window, {0, KPerBlock});
+        }
+
+        template <typename DstTileWindow, typename SrcBlockTile, typename ElementFunction>
+        CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
+                                         const SrcBlockTile& src_block_tile,
+                                         const ElementFunction& element_func) const
+        {
+            const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
+            store_tile(lds_tile_window, block_tile_tmp);
+        }
+
+        template <bool HasHotLoop,
+                  TailNumber TailNum,
+                  typename ADramBlockWindowTmp,
+                  typename BDramBlockWindowTmp,
+                  typename AElementFunction,
+                  typename BElementFunction>
+        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                       const AElementFunction& a_element_func,
+                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BElementFunction& b_element_func,
+                                       index_t num_loop,
+                                       void* p_smem) const
+        {
+            static_assert(
+                std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                    std::is_same_v<BDataType,
+                                   remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+                "A/B Dram block window should have the same data type as appropriate "
+                "([A|B]DataType) defined in Problem definition!");
+
+            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                              NPerBlock ==
+                                  BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                          "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock"
+                          " or KPerBlock!");
+
+            // ------------------------------------------------------------------------------------
+            // Definitions of all needed tiles
+
+            // A tile in LDS
+            ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
+            constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+            auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+            // TODO: LDS alignment should come from Policy!
+            constexpr index_t a_lds_block_space_size_aligned =
+                integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(),
+                                    16) *
+                16;
+
+            // B tile in LDS
+            BDataType* p_b_lds = static_cast<BDataType*>(
+                static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+            constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+            auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+            // A DRAM tile window for load
+            auto a_copy_dram_window =
+                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 a_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeADramTileDistribution<Problem>());
+
+            // A LDS tile window for store
+            auto a_copy_lds_window =
+                make_tile_window(a_lds_block,
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 a_copy_dram_window.get_tile_distribution());
+            // B DRAM tile window for load
+            auto b_copy_dram_window =
+                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 b_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeBDramTileDistribution<Problem>());
+
+            // B LDS tile window for store
+            auto b_copy_lds_window =
+                make_tile_window(b_lds_block,
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 b_copy_dram_window.get_tile_distribution());
+
+            // A LDS tile for block GEMM
+            auto a_lds_gemm_window = make_tile_window(
+                a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+            // B LDS tile for block GEMM
+            auto b_lds_gemm_window = make_tile_window(
+                b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+            // Block GEMM
+            constexpr auto block_gemm = BlockGemm();
+            auto c_block_tile         = block_gemm.MakeCBlockTile();
+
+            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+            using ABlockTile =
+                decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+            using BBlockTile =
+                decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+            tuple_array<ABlockTile, PrefetchStages> a_block_tiles;
+            tuple_array<BBlockTile, PrefetchStages> b_block_tiles;
+
+            // -----------------------------------------------------------------------------------------
+            // Gemm pipeline start
+
+            // prefetch
+            // global read 0
+            GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
+            GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
+
+            // initialize C
+            tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+            // LDS write 0
+            LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+            LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+
+            // Global prefetch [1, PrefetchStages]
+            static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
+                GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
+                GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
+            });
+
+            // main body
+            if constexpr(HasHotLoop)
+            {
+                index_t i = 0;
+                do
+                {
+                    static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) {
+                        block_sync_lds();
+                        // block_gemm.LocalPrefetch();
+                        block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+
+                        block_sync_lds();
+
+                        LocalPrefill(
+                            a_copy_lds_window,
+                            a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
+                            a_element_func);
+                        LocalPrefill(
+                            b_copy_lds_window,
+                            b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
+                            b_element_func);
+
+                        GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
+                                       a_copy_dram_window);
+                        GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
+                                       b_copy_dram_window);
+                    });
+
+                    i += PrefetchStages;
+                } while(i < (num_loop - PrefetchStages));
+            }
+
+            auto HotLoopTail = [&](auto tail_num) {
+                static_for<1, tail_num, 1>{}([&](auto prefetch_idx) {
+                    block_sync_lds();
+
+                    // block_gemm.LocalPrefetch();
+                    block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+
+                    block_sync_lds();
+                    LocalPrefill(a_copy_lds_window,
+                                 a_block_tiles.get(number<prefetch_idx>{}),
+                                 a_element_func);
+                    LocalPrefill(b_copy_lds_window,
+                                 b_block_tiles.get(number<prefetch_idx>{}),
+                                 b_element_func);
+                });
+
+                block_sync_lds();
+                // block_gemm.LocalPrefetch();
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            };
+
+            if constexpr(TailNum == TailNumber::One)
+            {
+                block_sync_lds();
+                // block_gemm.LocalPrefetch();
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            }
+            else if constexpr(TailNum == TailNumber::Two)
+            {
+                HotLoopTail(number<2>{});
+            }
+            else if constexpr(TailNum == TailNumber::Three)
+            {
+                HotLoopTail(number<3>{});
+            }
+            else if constexpr(TailNum == TailNumber::Four)
+            {
+                HotLoopTail(number<4>{});
+            }
+            else if constexpr(TailNum == TailNumber::Five)
+            {
+                HotLoopTail(number<5>{});
+            }
+            else if constexpr(TailNum == TailNumber::Six)
+            {
+                HotLoopTail(number<6>{});
+            }
+            else if constexpr(TailNum == TailNumber::Seven)
+            {
+                HotLoopTail(number<7>{});
+            }
+            else if constexpr(TailNum == TailNumber::Full)
+            {
+                HotLoopTail(number<PrefetchStages>{});
+            }
+
+            return c_block_tile;
+        }
+    };
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
+            a_dram_block_window_tmp,
+            a_element_func,
+            b_dram_block_window_tmp,
+            b_element_func,
+            num_loop,
+            p_smem);
+    }
+
+    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
+            a_dram_block_window_tmp,
+            [](const ADataType& a) { return a; },
+            b_dram_block_window_tmp,
+            [](const BDataType& b) { return b; },
+            num_loop,
+            p_smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
new file mode 100644
index 000000000..5e93ca21c
--- /dev/null
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ostream>
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+enum struct GemmPipelineScheduler
+{
+    Intrawave,
+    Interwave,
+};
+
+enum struct TailNumber
+{
+    // Single / Double buffer pipeline
+    Odd,
+    Even,
+
+    // Long prefetch pipeline, up to 8
+    One,
+    Two,
+    Three,
+    Four,
+    Five,
+    Six,
+    Seven,
+
+    // Unroll stages > Prefetch stages, number of loop is multiple of unroll stages
+    Empty,
+    // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add
+    // prefetchstages
+    Full,
+};
+
+} // namespace ck_tile
+
+inline std::ostream& operator<<(std::ostream& os, const ck_tile::GemmPipelineScheduler& s)
+{
+    switch(s)
+    {
+    case ck_tile::GemmPipelineScheduler::Intrawave: os << "Intrawave"; break;
+    case ck_tile::GemmPipelineScheduler::Interwave: os << "Interwave"; break;
+    default: os << "";
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const ck_tile::TailNumber& s)
+{
+    switch(s)
+    {
+    case ck_tile::TailNumber::Odd: os << "Odd"; break;
+    case ck_tile::TailNumber::Even: os << "Even"; break;
+    case ck_tile::TailNumber::One: os << "One"; break;
+    case ck_tile::TailNumber::Two: os << "Two"; break;
+    case ck_tile::TailNumber::Three: os << "Three"; break;
+    case ck_tile::TailNumber::Four: os << "Four"; break;
+    case ck_tile::TailNumber::Five: os << "Five"; break;
+    case ck_tile::TailNumber::Six: os << "Six"; break;
+    case ck_tile::TailNumber::Seven: os << "Seven"; break;
+    case ck_tile::TailNumber::Empty: os << "Empty"; break;
+    case ck_tile::TailNumber::Full: os << "Full"; break;
+    default: os << "";
+    }
+    return os;
+}
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index 5ed7d036e..a2424290e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,27 +19,27 @@ struct GemmPipelineAGmemBGmemCRegV1
     using CDataType      = remove_cvref_t<typename Problem::CDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
-    static constexpr index_t kBlockSize = Problem::kBlockSize;
+    using ALayout = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout = remove_cvref_t<typename Problem::BLayout>;
+    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
 
     static constexpr index_t kMPerBlock = BlockGemmShape::kM;
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t AlignmentA = Problem::AlignmentA;
-    static constexpr index_t AlignmentB = Problem::AlignmentB;
-    static constexpr index_t AlignmentC = Problem::AlignmentC;
+    static constexpr index_t VectorSizeA = Problem::VectorSizeA;
+    static constexpr index_t VectorSizeB = Problem::VectorSizeB;
+    static constexpr index_t VectorSizeC = Problem::VectorSizeC;
 
     static constexpr bool kPadA = Problem::kPadA;
     static constexpr bool kPadB = Problem::kPadB;
     static constexpr bool kPadC = Problem::kPadC;
 
-    using LayoutA = remove_cvref_t<typename Problem::LayoutA>;
-    using LayoutB = remove_cvref_t<typename Problem::LayoutB>;
-    using LayoutC = remove_cvref_t<typename Problem::LayoutC>;
-
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
     {
-        return ck_tile::integer_divide_ceil(
+        return integer_divide_ceil(
                    sizeof(ADataType) *
                        Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
                    16) *
@@ -48,7 +48,7 @@ struct GemmPipelineAGmemBGmemCRegV1
                    Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
     }
 
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         return Policy::template GetSmemSize<Problem>();
     }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index 8639f00fb..199ba56aa 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -71,8 +71,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
     {
-        using namespace ck_tile;
-
         constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
 
@@ -93,7 +91,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeA()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
     {
         constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
                                         MakeALdsBlockDescriptor<Problem>().get_element_space_size();
@@ -101,7 +99,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeB()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
     {
         constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
                                         MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
@@ -109,7 +107,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
         constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index bff7fc0a0..96a5a61c8 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -25,9 +25,9 @@ struct GemmPipelineAGmemBGmemCRegV2
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLdsSize()
+    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
     {
-        return ck_tile::integer_divide_ceil(
+        return integer_divide_ceil(
                    sizeof(ADataType) *
                        Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
                    16) *
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index d7b3b24a4..1156f549b 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -1,14 +1,15 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck_tile/core.hpp"
-
-#define VectorLoadSize 16
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 
 namespace ck_tile {
 
+static constexpr int _VectorSize = 16;
+
 template <typename ADataType_,
           typename BDataType_,
           typename CDataType_,
@@ -22,18 +23,52 @@ struct GemmPipelineProblem
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
     using GemmTraits     = remove_cvref_t<TileGemmTraits_>;
 
+    using ALayout = remove_cvref_t<typename GemmTraits::ALayout>;
+    using BLayout = remove_cvref_t<typename GemmTraits::BLayout>;
+    using CLayout = remove_cvref_t<typename GemmTraits::CLayout>;
+
     static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
     static constexpr bool kPadA         = GemmTraits::kPadA;
     static constexpr bool kPadB         = GemmTraits::kPadB;
     static constexpr bool kPadC         = GemmTraits::kPadC;
 
-    using LayoutA = remove_cvref_t<typename GemmTraits::LayoutA>;
-    using LayoutB = remove_cvref_t<typename GemmTraits::LayoutB>;
-    using LayoutC = remove_cvref_t<typename GemmTraits::LayoutC>;
+    static constexpr index_t VectorSizeA = kPadA ? 1 : _VectorSize / sizeof(ADataType);
+    static constexpr index_t VectorSizeB = kPadB ? 1 : _VectorSize / sizeof(BDataType);
+    static constexpr index_t VectorSizeC = kPadC ? 1 : _VectorSize / sizeof(CDataType);
+};
+
+template <typename ADataType_,
+          typename BDataType_,
+          typename CDataType_,
+          typename BlockGemmShape_,
+          typename TileGemmTraits_,
+          GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
+          bool HasHotLoop_                 = true,
+          TailNumber TailNum_              = TailNumber::Full>
+struct UniversalGemmPipelineProblem
+{
+    using ADataType      = remove_cvref_t<ADataType_>;
+    using BDataType      = remove_cvref_t<BDataType_>;
+    using CDataType      = remove_cvref_t<CDataType_>;
+    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
+    using GemmTraits     = remove_cvref_t<TileGemmTraits_>;
+
+    using ALayout = remove_cvref_t<typename GemmTraits::ALayout>;
+    using BLayout = remove_cvref_t<typename GemmTraits::BLayout>;
+    using CLayout = remove_cvref_t<typename GemmTraits::CLayout>;
+
+    static constexpr auto Scheduler     = Scheduler_;
+    static constexpr auto HasHotLoop    = HasHotLoop_;
+    static constexpr auto TailNum       = TailNum_;
+    static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
+
+    static constexpr bool kPadA = GemmTraits::kPadA;
+    static constexpr bool kPadB = GemmTraits::kPadB;
+    static constexpr bool kPadC = GemmTraits::kPadC;
 
-    static constexpr index_t AlignmentA = kPadA ? 1 : VectorLoadSize / sizeof(ADataType);
-    static constexpr index_t AlignmentB = kPadB ? 1 : VectorLoadSize / sizeof(BDataType);
-    static constexpr index_t AlignmentC = kPadC ? 1 : VectorLoadSize / sizeof(CDataType);
+    static constexpr index_t VectorSizeA = kPadA ? _VectorSize / sizeof(ADataType) : 1;
+    static constexpr index_t VectorSizeB = kPadB ? _VectorSize / sizeof(BDataType) : 1;
+    static constexpr index_t VectorSizeC = kPadC ? _VectorSize / sizeof(CDataType) : 1;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
index 98da1510c..9d050be2f 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
@@ -1,27 +1,25 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
-#include "ck_tile/core.hpp"
-
 namespace ck_tile {
 
 template <bool kPadA_,
           bool kPadB_,
           bool kPadC_,
-          typename LayoutA_,
-          typename LayoutB_,
-          typename LayoutC_>
+          typename ALayout_,
+          typename BLayout_,
+          typename CLayout_>
 struct TileGemmTraits
 {
     static constexpr bool kPadA = kPadA_;
     static constexpr bool kPadB = kPadB_;
     static constexpr bool kPadC = kPadC_;
 
-    using LayoutA = LayoutA_;
-    using LayoutB = LayoutB_;
-    using LayoutC = LayoutC_;
+    using ALayout = ALayout_;
+    using BLayout = BLayout_;
+    using CLayout = CLayout_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index dd164e72e..bb59a7298 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -39,9 +39,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
 #if defined(__gfx9__)
         c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = c_vec;
+        ignore = a_vec;
+        ignore = b_vec;
 #endif
     }
 
@@ -52,8 +52,8 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
 #else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = a_vec;
+        ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
@@ -90,9 +90,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
 #if defined(__gfx9__)
         c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = c_vec;
+        ignore = a_vec;
+        ignore = b_vec;
 #endif
     }
 
@@ -103,8 +103,8 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
 #else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = a_vec;
+        ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
@@ -154,9 +154,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
                 0);
         });
 #else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = c_vec;
+        ignore = a_vec;
+        ignore = b_vec;
 #endif
     }
 
@@ -181,8 +181,8 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
         });
         return c_vec;
 #else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = a_vec;
+        ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
@@ -231,9 +231,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
                 0);
         });
 #else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = c_vec;
+        ignore = a_vec;
+        ignore = b_vec;
 #endif
     }
 
@@ -258,8 +258,8 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
         });
         return c_vec;
 #else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = a_vec;
+        ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
@@ -320,9 +320,9 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
             c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
         });
 #else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = c_vec;
+        ignore = a_vec;
+        ignore = b_vec;
 #endif
     }
 
@@ -356,8 +356,8 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
         });
         return c_vec;
 #else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
+        ignore = a_vec;
+        ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 99cd5d787..4183d9cb9 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher;
 
 // clang-format off
 // fp16
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
 
 // bf16
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
 
 // fp8
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
 
 // clang-format on
 } // namespace impl
diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt
index 9075ca2ed..ac9c4311d 100644
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(image_to_column)
+add_subdirectory(gemm)
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
new file mode 100644
index 000000000..f96ad9c6e
--- /dev/null
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Currently ck_tile is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
+endif()
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
new file mode 100644
index 000000000..f72a80b5a
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_gemm_mem_pipeline_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F32 = float;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
+
+#include "test_gemm_mem_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
new file mode 100644
index 000000000..b26114f39
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
@@ -0,0 +1,41 @@
+#pragma once
+
+TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 1024;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 1024;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 1024;
+    constexpr int K = 432;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 1024;
+    constexpr int K = 512;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
new file mode 100644
index 000000000..1b243ab43
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <sstream>
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+template <typename Tuple>
+class TestCkTileGemmMemPipeline : public ::testing::Test
+{
+    protected:
+    using ALayout     = std::tuple_element_t<0, Tuple>;
+    using BLayout     = std::tuple_element_t<1, Tuple>;
+    using CLayout     = std::tuple_element_t<2, Tuple>;
+    using ADataType   = std::tuple_element_t<3, Tuple>;
+    using BDataType   = std::tuple_element_t<4, Tuple>;
+    using AccDataType = std::tuple_element_t<5, Tuple>;
+    using CDataType   = std::tuple_element_t<6, Tuple>;
+    // TODO: expose tile size through test t-param ?
+
+    struct gemm_basic_args
+    {
+        const void* p_a;
+        const void* p_b;
+        void* p_c;
+        ck_tile::index_t kbatch;
+        ck_tile::index_t M;
+        ck_tile::index_t N;
+        ck_tile::index_t K;
+        ck_tile::index_t stride_A;
+        ck_tile::index_t stride_B;
+        ck_tile::index_t stride_C;
+    };
+
+    void invoke_gemm(const gemm_basic_args& args, const ck_tile::stream_config& s)
+    {
+        // TODO: This should be parameterized in tests
+        constexpr ck_tile::index_t M_Tile = 128;
+        constexpr ck_tile::index_t N_Tile = 128;
+        constexpr ck_tile::index_t K_Tile = 32;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+        constexpr bool kPadA = true;
+        constexpr bool kPadB = true;
+        constexpr bool kPadC = true;
+
+        constexpr int kBlockPerCu = 1;
+
+        // ===============================================
+
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+        using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
+
+        using GemmEpilogue = ck_tile::Default2DEpilogue<
+            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, false, kPadC>>;
+
+        using Traits = ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
+
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
+                ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      GemmShape,
+                                                      Traits,
+                                                      ck_tile::GemmPipelineScheduler::Intrawave,
+                                                      has_hot_loop_v,
+                                                      tail_number_v>>;
+            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKargs(args.p_a,
+                                           args.p_b,
+                                           args.p_c,
+                                           args.M,
+                                           args.N,
+                                           args.K,
+                                           args.stride_A,
+                                           args.stride_B,
+                                           args.stride_C);
+
+            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Lunching kernel with args:"
+                          << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        };
+
+        if(has_hot_loop)
+        {
+            // Tail pipeline One to Seven
+            if(tail_num == ck_tile::TailNumber::One)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+
+            if constexpr(BaseGemmPipeline::PrefetchStages > 2)
+            {
+                if(tail_num == ck_tile::TailNumber::Two)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Two>{});
+                }
+            }
+            if constexpr(BaseGemmPipeline::PrefetchStages > 3)
+            {
+                if(tail_num == ck_tile::TailNumber::Three)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Three>{});
+                }
+            }
+            if constexpr(BaseGemmPipeline::PrefetchStages > 4)
+            {
+                if(tail_num == ck_tile::TailNumber::Four)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Four>{});
+                }
+            }
+            if constexpr(BaseGemmPipeline::PrefetchStages > 5)
+            {
+                if(tail_num == ck_tile::TailNumber::Five)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Five>{});
+                }
+            }
+            if constexpr(BaseGemmPipeline::PrefetchStages > 6)
+            {
+                if(tail_num == ck_tile::TailNumber::Six)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Six>{});
+                }
+            }
+            if constexpr(BaseGemmPipeline::PrefetchStages > 7)
+            {
+                if(tail_num == ck_tile::TailNumber::Seven)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber,
+                                                   ck_tile::TailNumber::Seven>{});
+                }
+            }
+        }
+        else
+        {
+            // Tail number always Full - #PrefetchStages
+            if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<false>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+            else
+            {
+                std::ostringstream err;
+                err << "When there's no hot loop, this tail number \"" << tail_num
+                    << "\" is not supported! " << __FILE__ << ":" << __LINE__
+                    << ", in function: " << __func__;
+                throw std::runtime_error(err.str());
+            }
+        }
+    }
+
+    public:
+    std::vector<int> k_batches_;
+
+    void SetUp() override { k_batches_ = {1}; }
+
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA = 0,
+             const int StrideB = 0,
+             const int StrideC = 0)
+    {
+        for(auto kb : k_batches_)
+        {
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
+        }
+    }
+
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   int kbatch = 1)
+    {
+        using namespace ck_tile::literals;
+
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    // give a chance if stride is zero, return a default packed stride
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        std::size_t stride_A = f_get_default_stride(M, K, StrideA, ALayout{});
+        std::size_t stride_B = f_get_default_stride(K, N, StrideB, BLayout{});
+        std::size_t stride_C = f_get_default_stride(M, N, StrideC, CLayout{});
+
+        ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
+        ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+
+        ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5}(a_m_k);
+        ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5}(b_k_n);
+
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+
+        gemm_basic_args args;
+        args.p_a      = a_m_k_dev_buf.GetDeviceBuffer();
+        args.p_b      = b_k_n_dev_buf.GetDeviceBuffer();
+        args.p_c      = c_m_n_dev_buf.GetDeviceBuffer();
+        args.kbatch   = kbatch;
+        args.M        = M;
+        args.N        = N;
+        args.K        = K;
+        args.stride_A = stride_A;
+        args.stride_B = stride_B;
+        args.stride_C = stride_C;
+
+        invoke_gemm(args, ck_tile::stream_config{nullptr, false});
+
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+        bool pass = true;
+
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+        EXPECT_TRUE(pass);
+    }
+};
-- 
GitLab


From 7d9111545f7541b16aca7c52c871314402983596 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Wed, 30 Oct 2024 23:13:30 +0800
Subject: [PATCH 027/153] clang-format (#1612)

---
 .../ck_tile/ops/reduce/block/block_reduce.hpp  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index d9df949cf..fa3007d1e 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -16,8 +16,8 @@ namespace ck_tile {
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
 template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
-                                                          const ReduceFunc& reduce_func,
-                                                          bool_constant<WithBroadcast> = {})
+                                           const ReduceFunc& reduce_func,
+                                           bool_constant<WithBroadcast> = {})
 {
     using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode       = typename Dstr::DstrEncode;
@@ -116,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
  */
 template <typename AccDistributedTensor_, typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
-                                                              const ReduceFunc& reduce_func)
+                                               const ReduceFunc& reduce_func)
 {
     using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode       = typename Dstr::DstrEncode;
@@ -175,9 +175,9 @@ template <typename AccDistributedTensor_,
           index_t... InReduceDims,
           typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
-                                                     const InDistributedTensor_& in_tensor,
-                                                     sequence<InReduceDims...>,
-                                                     const ReduceFunc& reduce_func)
+                                      const InDistributedTensor_& in_tensor,
+                                      sequence<InReduceDims...>,
+                                      const ReduceFunc& reduce_func)
 {
     constexpr auto I0 = number<0>{};
     constexpr auto I1 = number<1>{};
@@ -250,9 +250,9 @@ template <typename AccDataType_,
           typename ReduceFunc,
           typename InDataType_>
 CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
-                                                     sequence<InReduceDims...> in_reduce_dims,
-                                                     const ReduceFunc& reduce_func,
-                                                     const InDataType_& reduce_init)
+                                      sequence<InReduceDims...> in_reduce_dims,
+                                      const ReduceFunc& reduce_func,
+                                      const InDataType_& reduce_init)
 {
     using InDataType  = typename InDistributedTensor_::DataType;
     using AccDataType = remove_cvref_t<AccDataType_>;
-- 
GitLab


From 9a8a52130d780ca449ae261bb03ae4783f18f296 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 30 Oct 2024 17:42:50 +0100
Subject: [PATCH 028/153] Remove virtual destructors from unary ops (#1610)

* Remove virtual destructors from unary ops

* Fixes

* Fixes

* clang format fixes
---
 .../element/unary_element_wise_operation.hpp  | 112 +++++++++++++++---
 include/ck_tile/core/numeric/math.hpp         |   2 +-
 .../host/reference/reference_elementwise.hpp  |   2 +-
 .../host/reference/reference_permute.hpp      |   2 +-
 .../reference/reference_rmsnorm2d_fwd.hpp     |   2 +-
 .../add_rmsnorm2d_rdquant_fwd_shape.hpp       |   2 +-
 ...rmsnorm2d_rdquant_fwd_pipeline_problem.hpp |   2 +-
 .../ops/fmha/pipeline/tile_fmha_shape.hpp     |   2 +-
 .../pipeline/generic_petmute_problem.hpp      |   2 +-
 .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp  |   2 +-
 .../rmsnorm2d_fwd_pipeline_problem.hpp        |   2 +-
 .../ops/welford/block/block_welford.hpp       |   2 +-
 12 files changed, 105 insertions(+), 29 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 712b88618..39b81ca57 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -13,15 +13,17 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
 struct UnaryOpBase
 {
     public:
-    __host__ __device__ virtual ~UnaryOpBase() = default;
+    __host__ __device__ ~UnaryOpBase() = default;
 
-    __host__ __device__ UnaryOpBase()                   = default;
-    __host__ __device__ UnaryOpBase(const UnaryOpBase&) = default;
+    __host__ __device__ constexpr UnaryOpBase()                   = default;
+    __host__ __device__ constexpr UnaryOpBase(const UnaryOpBase&) = default;
+    __host__ __device__ constexpr UnaryOpBase(UnaryOpBase&&)      = default;
     __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
-    __host__ __device__ UnaryOpBase(UnaryOpBase&&)                 = default;
     __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default;
 
     __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0;
@@ -50,8 +52,14 @@ struct PassThroughPack2
     constexpr const static bool is_pack2_invocable = true;
 };
 
-struct PassThrough : public UnaryOpBase
+struct PassThrough final : public UnaryOpBase
 {
+    __host__ __device__ constexpr PassThrough()                   = default;
+    __host__ __device__ constexpr PassThrough(const PassThrough&) = default;
+    __host__ __device__ constexpr PassThrough(PassThrough&&)      = default;
+    __host__ __device__ PassThrough& operator=(const PassThrough&) = default;
+    __host__ __device__ PassThrough& operator=(PassThrough&&) = default;
+    __host__ __device__ ~PassThrough()                        = default;
 
     __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
 
@@ -409,8 +417,15 @@ struct UnarySquare
     };
 };
 
-struct UnaryAbs : public UnaryOpBase
+struct UnaryAbs final : public UnaryOpBase
 {
+    __host__ __device__ constexpr UnaryAbs()                = default;
+    __host__ __device__ constexpr UnaryAbs(const UnaryAbs&) = default;
+    __host__ __device__ constexpr UnaryAbs(UnaryAbs&&)      = default;
+    __host__ __device__ UnaryAbs& operator=(const UnaryAbs&) = default;
+    __host__ __device__ UnaryAbs& operator=(UnaryAbs&&) = default;
+    __host__ __device__ ~UnaryAbs()                     = default;
+
     __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
         y = ck::math::abs(x);
@@ -459,8 +474,15 @@ struct UnarySqrt
     };
 };
 
-struct Relu : public UnaryOpBase
+struct Relu final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Relu()            = default;
+    __host__ __device__ constexpr Relu(const Relu&) = default;
+    __host__ __device__ constexpr Relu(Relu&&)      = default;
+    __host__ __device__ Relu& operator=(const Relu&) = default;
+    __host__ __device__ Relu& operator=(Relu&&) = default;
+    __host__ __device__ ~Relu()                 = default;
+
     __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
         y = x > 0 ? x : 0;
@@ -633,8 +655,14 @@ struct Gelu
     }
 };
 
-struct Sigmoid : public UnaryOpBase
+struct Sigmoid final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Sigmoid()               = default;
+    __host__ __device__ constexpr Sigmoid(const Sigmoid&) = default;
+    __host__ __device__ constexpr Sigmoid(Sigmoid&&)      = default;
+    __host__ __device__ Sigmoid& operator=(const Sigmoid&) = default;
+    __host__ __device__ Sigmoid& operator=(Sigmoid&&) = default;
+    __host__ __device__ ~Sigmoid()                    = default;
 
     __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
@@ -688,8 +716,15 @@ struct Silu
     };
 };
 
-struct TanH : public UnaryOpBase
+struct TanH final : public UnaryOpBase
 {
+    __host__ __device__ constexpr TanH()            = default;
+    __host__ __device__ constexpr TanH(const TanH&) = default;
+    __host__ __device__ constexpr TanH(TanH&&)      = default;
+    __host__ __device__ TanH& operator=(const TanH&) = default;
+    __host__ __device__ TanH& operator=(TanH&&) = default;
+    __host__ __device__ ~TanH()                 = default;
+
     __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
         y = ck::math::tanh(x);
@@ -959,8 +994,12 @@ struct Rcp
     };
 };
 
-struct Swish : public UnaryOpBase
+struct Swish final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Swish(const Swish&) = default;
+    __host__ __device__ constexpr Swish(Swish&&)      = default;
+    __host__ __device__ ~Swish()                      = default;
+
     __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {}
 
     __host__ __device__ float get_beta() const { return beta_; }
@@ -1019,8 +1058,12 @@ struct Swish : public UnaryOpBase
     }
 };
 
-struct SoftRelu : public UnaryOpBase
+struct SoftRelu final : public UnaryOpBase
 {
+    __host__ __device__ constexpr SoftRelu(const SoftRelu&) = default;
+    __host__ __device__ constexpr SoftRelu(SoftRelu&&)      = default;
+    __host__ __device__ ~SoftRelu()                         = default;
+
     __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {}
 
     __host__ __device__ float get_alpha() const { return alpha_; }
@@ -1070,8 +1113,12 @@ struct SoftRelu : public UnaryOpBase
     }
 };
 
-struct Power : public UnaryOpBase
+struct Power final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Power(const Power&) = default;
+    __host__ __device__ constexpr Power(Power&&)      = default;
+    __host__ __device__ ~Power()                      = default;
+
     __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
         : alpha_(alpha), beta_(beta), gamma_(gamma)
     {
@@ -1148,8 +1195,12 @@ struct Power : public UnaryOpBase
     }
 };
 
-struct ClippedRelu : public UnaryOpBase
+struct ClippedRelu final : public UnaryOpBase
 {
+    __host__ __device__ constexpr ClippedRelu(const ClippedRelu&) = default;
+    __host__ __device__ constexpr ClippedRelu(ClippedRelu&&)      = default;
+    __host__ __device__ ~ClippedRelu()                            = default;
+
     __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f)
         : alpha_(alpha), beta_(beta)
     {
@@ -1205,8 +1256,11 @@ struct ClippedRelu : public UnaryOpBase
     }
 };
 
-struct LeakyRelu : public UnaryOpBase
+struct LeakyRelu final : public UnaryOpBase
 {
+    __host__ __device__ constexpr LeakyRelu(const LeakyRelu&) = default;
+    __host__ __device__ constexpr LeakyRelu(LeakyRelu&&)      = default;
+    __host__ __device__ ~LeakyRelu()                          = default;
 
     __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {}
 
@@ -1250,8 +1304,11 @@ struct LeakyRelu : public UnaryOpBase
     }
 };
 
-struct Elu : public UnaryOpBase
+struct Elu final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Elu(const Elu&) = default;
+    __host__ __device__ constexpr Elu(Elu&&)      = default;
+    __host__ __device__ ~Elu()                    = default;
 
     __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {}
 
@@ -1296,8 +1353,11 @@ struct Elu : public UnaryOpBase
     }
 };
 
-struct Logistic : public UnaryOpBase
+struct Logistic final : public UnaryOpBase
 {
+    __host__ __device__ constexpr Logistic(const Logistic&) = default;
+    __host__ __device__ constexpr Logistic(Logistic&&)      = default;
+    __host__ __device__ ~Logistic()                         = default;
 
     __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {}
 
@@ -1631,8 +1691,23 @@ struct DynamicUnaryOp
 
     __host__ __device__ ~DynamicUnaryOp()
     {
-        if(unary_op_ptr_)
-            delete unary_op_ptr_;
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): delete static_cast<Swish*>(unary_op_ptr_); break;
+        case(UnaryOpType::Sigmoid): delete static_cast<Sigmoid*>(unary_op_ptr_); break;
+        case(UnaryOpType::PassThrough): delete static_cast<PassThrough*>(unary_op_ptr_); break;
+        case(UnaryOpType::Logistic): delete static_cast<Logistic*>(unary_op_ptr_); break;
+        case(UnaryOpType::TanH): delete static_cast<TanH*>(unary_op_ptr_); break;
+        case(UnaryOpType::Relu): delete static_cast<Relu*>(unary_op_ptr_); break;
+        case(UnaryOpType::SoftRelu): delete static_cast<SoftRelu*>(unary_op_ptr_); break;
+        case(UnaryOpType::UnaryAbs): delete static_cast<UnaryAbs*>(unary_op_ptr_); break;
+        case(UnaryOpType::Power): delete static_cast<Power*>(unary_op_ptr_); break;
+        case(UnaryOpType::ClippedRelu): delete static_cast<ClippedRelu*>(unary_op_ptr_); break;
+        case(UnaryOpType::LeakyRelu): delete static_cast<LeakyRelu*>(unary_op_ptr_); break;
+        case(UnaryOpType::Elu): delete static_cast<Elu*>(unary_op_ptr_); break;
+
+        default: break;
+        }
     }
 
     __device__ void InitUnaryOpPtrOnDevice()
@@ -1721,6 +1796,7 @@ struct DynamicUnaryOp
     float beta;
     float gamma;
 };
+#pragma clang diagnostic pop
 
 } // namespace element_wise
 } // namespace tensor_operation
diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp
index 0faf1aa04..6bdcb509b 100644
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/host/reference/reference_elementwise.hpp b/include/ck_tile/host/reference/reference_elementwise.hpp
index 809049fa6..65303279b 100644
--- a/include/ck_tile/host/reference/reference_elementwise.hpp
+++ b/include/ck_tile/host/reference/reference_elementwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp
index 1c8248340..14ed4f815 100644
--- a/include/ck_tile/host/reference/reference_permute.hpp
+++ b/include/ck_tile/host/reference/reference_permute.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
index db6e92f4c..b14e25a85 100644
--- a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
index a17c53c73..4bc7db434 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
index 106e5086b..2e6406003 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index 570754b22..bb33b5f02 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp
index e504ed747..17f18acb5 100644
--- a/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp
+++ b/include/ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
index fb484a106..fc4b9f470 100644
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
index 87cab3463..2820e1813 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
index 623e1e16d..ce73c183e 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
-- 
GitLab


From c3a4800c5fe1f7cbdd00f36b7bc4851e0299ddc9 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 31 Oct 2024 14:54:53 +0800
Subject: [PATCH 029/153] [CK_TILE] layernorm support fused-quant/fused-add
 (#1604)

* add prenorm/postnorm support, refactor using generate.py

* update README

* update README

* fix format

* update some description and fix format

* update format

* format

* use non-raw for loading

* format and update n4096

* dynamic-quant ready

* update readme

* support fused dynamic-quant

* update fused-quant, with smooth

* update README

* update args

* update some based on comment
---
 example/ck_tile/02_layernorm2d/CMakeLists.txt |  31 +-
 example/ck_tile/02_layernorm2d/README.md      |  69 +-
 example/ck_tile/02_layernorm2d/generate.py    | 670 ++++++++++++++++++
 .../instances/layernorm2d_fwd_api.cpp         | 155 ----
 .../layernorm2d_fwd_bf16_n1024_instance.cpp   |  22 -
 .../layernorm2d_fwd_bf16_n1536_instance.cpp   |  13 -
 .../layernorm2d_fwd_bf16_n2048_instance.cpp   |  14 -
 .../layernorm2d_fwd_bf16_n256_instance.cpp    |  12 -
 .../layernorm2d_fwd_bf16_n3072_instance.cpp   |  14 -
 .../layernorm2d_fwd_bf16_n4096_instance.cpp   |  14 -
 ...layernorm2d_fwd_bf16_n4096_tp_instance.cpp |  14 -
 .../layernorm2d_fwd_bf16_n512_instance.cpp    |  13 -
 ...layernorm2d_fwd_bf16_n64_n128_instance.cpp |  12 -
 .../layernorm2d_fwd_bf16_n768_instance.cpp    |  12 -
 .../layernorm2d_fwd_fp16_n1024_instance.cpp   |  22 -
 .../layernorm2d_fwd_fp16_n1536_instance.cpp   |  13 -
 .../layernorm2d_fwd_fp16_n2048_instance.cpp   |  14 -
 .../layernorm2d_fwd_fp16_n256_instance.cpp    |  12 -
 .../layernorm2d_fwd_fp16_n3072_instance.cpp   |  14 -
 .../layernorm2d_fwd_fp16_n4096_instance.cpp   |  14 -
 ...layernorm2d_fwd_fp16_n4096_tp_instance.cpp |  14 -
 .../layernorm2d_fwd_fp16_n512_instance.cpp    |  13 -
 ...layernorm2d_fwd_fp16_n64_n128_instance.cpp |  12 -
 .../layernorm2d_fwd_fp16_n768_instance.cpp    |  12 -
 .../layernorm2d_fwd_instance_common.hpp       |  67 --
 .../02_layernorm2d/layernorm2d_fwd.cpp        | 270 ++++++-
 .../02_layernorm2d/layernorm2d_fwd.hpp        | 103 +--
 .../ck_tile/02_layernorm2d/misc/dquant.png    | Bin 0 -> 36863 bytes
 example/ck_tile/02_layernorm2d/misc/pnorm.png | Bin 0 -> 32113 bytes
 .../02_layernorm2d/script/perf_test.sh        |  66 +-
 .../02_layernorm2d/script/smoke_test.sh       |  54 +-
 include/ck_tile/core.hpp                      |   1 +
 include/ck_tile/core/numeric/int8.hpp         | 104 +++
 include/ck_tile/core/numeric/type_convert.hpp |   4 +
 .../ck_tile/core/tensor/null_tile_window.hpp  |   7 +
 .../reference/reference_layernorm2d_fwd.hpp   |  37 +-
 include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp |   1 +
 include/ck_tile/ops/common.hpp                |   1 +
 .../generic_2d_block_shape.hpp}               |   7 +-
 include/ck_tile/ops/elementwise.hpp           |   1 +
 include/ck_tile/ops/epilogue.hpp              |   2 +
 .../ops/epilogue/default_2d_epilogue.hpp      |  28 +-
 .../ops/epilogue/dynamic_quant_epilogue.hpp   | 140 ++++
 include/ck_tile/ops/fmha.hpp                  |   1 +
 include/ck_tile/ops/gemm.hpp                  |   1 +
 include/ck_tile/ops/image_to_column.hpp       |   1 +
 include/ck_tile/ops/layernorm2d.hpp           |   3 +-
 .../kernel/layernorm2d_fwd_kernel.hpp         | 191 ++++-
 .../layernorm2d_fwd_pipeline_one_pass.hpp     |  82 ++-
 .../layernorm2d_fwd_pipeline_problem.hpp      |  12 +-
 .../layernorm2d_fwd_pipeline_two_pass.hpp     |  79 ++-
 .../pipeline/layernorm2d_fwd_traits.hpp       |  54 ++
 include/ck_tile/ops/permute.hpp               |   1 +
 include/ck_tile/ops/reduce.hpp                |   1 +
 .../ck_tile/ops/reduce/block/block_reduce.hpp |   5 +-
 .../ops/reduce/block/block_reduce2d.hpp       |  26 +-
 include/ck_tile/ops/rmsnorm2d.hpp             |   1 +
 include/ck_tile/ops/softmax.hpp               |   1 +
 include/ck_tile/ops/topk.hpp                  |   1 +
 include/ck_tile/ops/topk_softmax.hpp          |   1 +
 include/ck_tile/ops/welford.hpp               |   1 +
 61 files changed, 1792 insertions(+), 768 deletions(-)
 create mode 100644 example/ck_tile/02_layernorm2d/generate.py
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
 delete mode 100644 example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
 create mode 100644 example/ck_tile/02_layernorm2d/misc/dquant.png
 create mode 100644 example/ck_tile/02_layernorm2d/misc/pnorm.png
 create mode 100644 include/ck_tile/core/numeric/int8.hpp
 rename include/ck_tile/ops/{layernorm2d/kernel/layernorm2d_fwd_shape.hpp => common/generic_2d_block_shape.hpp} (96%)
 create mode 100644 include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
 create mode 100644 include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp

diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt
index feae5f791..1bf74bc05 100644
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
@@ -1,11 +1,34 @@
+set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
+set(LAYERNORM2D_FWD_ENABLE_APIS  "fwd" CACHE STRING
+    "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(LAYERNORM2D_FWD_ENABLE_APIS  STREQUAL "all")
+  set(LAYERNORM2D_FWD_ENABLE_APIS  ${LAYERNORM2D_FWD_KNOWN_APIS})
+endif()
+
+# generate a list of kernels, but not actually emit files at config sta
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+endif()
+
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt LAYERNORM2D_FWD_GEN_BLOBS)
+
+add_custom_command(
+  OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
+)
+
 set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
-# not using add_example_executable() to add this target, since we don't want this to have
-# to be included in "make all/install/check"
+
 message("adding example ${EXAMPLE_LAYERNORM2D_FWD}")
-file(GLOB INSTANCE_SRCS instances/*.cpp)
 add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
 target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${INSTANCE_SRCS})
+target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
 
 set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS)
 
diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md
index 405325a2a..14c6fc0d6 100644
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -1,6 +1,42 @@
 # Layernorm2D forward
 
-This folder contains example for Layernorm2D forward using ck_tile tile-programming implementation.
+This folder contains example for Layernorm2D forward using `ck_tile` tile-programming implementation.
+
+# Implementation and feature support
+
+## welford online algorithm
+We use welfold algorithm to update `mean`/`variance` block by block. For `N <=4096` case we can compute `mean`/`var`/`normalization` within one loop, we call it `one-pass`. For large N case, it is hard to keep `mean`/`var` inside register/LDS and then computation `normalization`, so we need to load input twice, first time to compute `mean`/`var` block-by-block, then load input another time to compute the `normalization`. We call it `two-pass`.
+
+## mean/variance save
+In training case the mean/variance need to store out (TBD, not supported yet)
+
+## prenorm/postnorm
+
+![](misc/pnorm.png)
+
+since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example boosts this feature by kernel fusion. Note that `prenorm`/`postnorm` always need to do elementwise-add a `shortcut` before the actual layernorm computation, and optionally store out the result to global. You can use `-fadd=1` to test `pre-add+store`, or `-fadd=2` to test `pre-add` without store out (not codegen by default).
+
+## smooth-quant/dynamic-quant
+we support smooth/dynamic quantization for `int8` output, by setting `-fquant=1` and `-prec_o=int8`. In this case the output will doing a rowwise dynamic quantization like below. Note that smooth-quant require input a `(1*N)` size per-channel scale(in fp32 in our example, though this is customizable), then elememt-wise multiply the tensor for each row, then compute the rowwise dynamic quant. if set `-fquant=2` will have the input per-channel scale stage, only the dynamic quant. This case is supported in our kernel but by default not generated (TBD: add some filter in generate.py support on-demand codegen)
+![](misc/dquant.png)
+
+```
+# assume output int8, hidden_states is [m, n] shape and in fp16/bf16
+# [m, 1]
+per_token_amax, _ = torch.max(
+     input=torch.abs(hidden_states), 
+     dim=-1, 
+     keepdim=True
+)
+per_token_scale = per_token_amax.to(dtype=torch.float32) / 127.0
+
+# quant hidden_states
+hidden_states = (hidden_states / per_token_scale).to(dtype=torch.int8)
+
+return hidden_states, per_token_scale
+# hidden_states now is int8 will feed to next layer as intput
+# per_token_scale will be used as dequant factor later layer
+```
 
 ## build
 ```
@@ -15,8 +51,35 @@ This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
 ```
 args:
           -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
+          -n    n dimension (default:4096)
+     -stride    stride per row, if -1 then equal to n (default:-1)
           -e    epsilon (default:1e-5)
+    -save_mv    save mean/variance(invstd) or not. set to 1 in training case (default:0)
           -v    cpu validation or not (default:1)
-       -prec    precision (default:fp16)
+      -kname    print kernel name or not (default:1)
+     -prec_i    input precision (default:fp16)
+     -prec_o    output precision, set auto will be the same as input (default:auto)
+    -prec_sx    output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto)
+    -prec_sy    output quant scale type, set auto will be the same as input. used when fquant=1 or 2 (default:auto)
+       -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
+     -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+
 ```
+
+## limitations
+Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated. though our kernel template suppor this. (TBD: add some flag in generate.py) to generate those instance on demand. Beside, N>8192 case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet.
+
+```
+# some case
+# standard fp16 layernorm 2d, m=10. n=1024
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024
+
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1
+
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1
+
+```
\ No newline at end of file
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
new file mode 100644
index 000000000..300f6c05e
--- /dev/null
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -0,0 +1,670 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import argparse
+from enum import IntEnum
+from pathlib import Path
+import sys
+from typing import List, Optional, Any
+import functools
+import itertools
+import copy
+from dataclasses import dataclass
+
+def get_if_str(idx, total, lase_else = True):
+    if idx == 0:
+        return 'if'
+    elif idx < total - 1:
+        return 'else if'
+    else:
+        if lase_else:
+            return 'else'
+        else:
+            return 'else if'
+
+FUSED_ADD_ENUM_STR_MAP = [
+    'no',
+    'pras',      # pre-norm
+    'pra' ]      # post-norm
+
+FUSED_FUSED_SWEEP_STR_MAP = [
+    'no',
+    'dquant' ]
+
+DATA_TYPE_MAP = {'fp32' : 'float',
+                 'fp16' : 'ck_tile::fp16_t',
+                 'bf16' : 'ck_tile::bf16_t',
+                 'int8' : 'ck_tile::int8_t'}
+
+def BOOL_MAP(b_) -> str:
+    if b_:
+        return 'true'
+    else:
+        return 'false'
+
+class layernorm_fwd_codegen:
+    API_TRAITS_DEFINE = """
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename XDataType_,
+          typename YDataType_,
+          typename XScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_,
+          ck_tile::index_t kFusedAdd_ = 0,
+          ck_tile::index_t kFusedQuant_ = 0>
+struct layernorm2d_fwd_traits_
+{
+    using XDataType = ck_tile::remove_cvref_t<XDataType_>;
+    using YDataType = ck_tile::remove_cvref_t<YDataType_>;
+    using XScaleDataType = ck_tile::remove_cvref_t<XScaleDataType_>;
+    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN           = kPadN_;
+    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kTwoPass        = kTwoPass_;
+    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
+    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
+};
+
+template <typename XDataType_,
+          typename YDataType_,
+          typename XScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_,
+          int  kFusedAdd_,
+          int  kFusedQuant_>
+using traits_ = layernorm2d_fwd_traits_<XDataType_,
+                                       YDataType_,
+                                       XScaleDataType_,
+                                       YScaleDataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kSaveMeanInvStd_,
+                                       kTwoPass_,
+                                       kFusedAdd_,
+                                       kFusedQuant_>;
+"""
+    API_COMMON_HEADER = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+#include <ck_tile/ops/epilogue.hpp>
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = layernorm2d_fwd_args;
+
+{F_traits_define}
+
+template <typename Traits_>
+float layernorm2d_fwd_(const S& s, A a)
+{{
+    using XDataType = typename Traits_::XDataType;
+    using YDataType = typename Traits_::YDataType;
+    using XScaleDataType = typename Traits_::XScaleDataType;
+    using YScaleDataType = typename Traits_::YScaleDataType;
+    using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType;
+
+    using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
+        Traits_::kSaveMeanInvStd,
+        Traits_::kTwoPass,
+        static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
+        static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
+    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::GammaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::BetaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::ComputeDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::MeanDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::InvStdDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::XScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, XScaleDataType, YScaleDataType>::YScaleDataType,
+        typename Traits_::Shape,
+        PipelineTraits>;
+
+    using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
+    using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
+
+    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, false,  true/*max3*/>>;
+
+    using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
+
+    using Epilogue = std::conditional_t<Traits_::kFusedQuant == 1, DynamicQuantEpilogue,  Default2DEpilogue>;
+
+    using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+}}
+
+"""
+
+    API_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+
+{F_traits_define}
+
+// Note: this internal API only declare, not define here, otherwise will block `make -j`
+template <typename Traits_>
+float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
+
+float layernorm2d_fwd(layernorm2d_fwd_traits t,
+                      layernorm2d_fwd_args a,
+                      const ck_tile::stream_config& s)
+{{
+    float r = -1;
+{F_dispatch}
+    return r;
+}}
+
+"""
+
+    API_PER_DTYPE="""    {F_if}(t.prec_i == \"{F_i_type}\" && t.prec_o == \"{F_o_type}\"){{
+{F_per_n_case}
+    }}
+"""
+    API_PER_N_CASE="""        {F_if} {F_N_COND} {{
+{F_inner_dispatch}
+        }}
+"""
+    API_INNER_CASE="""            {F_if} {F_VEC_COND}
+                r={F_instance_func}(s, a);
+"""
+
+    INSTANCE_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_api_common.hpp"
+
+// clang-format off
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv     2p      add  sweep
+{F_instance_def}
+// clang-format on
+
+"""
+
+    def __init__(self, working_path, kernel_filter):
+        self.working_path = working_path
+        self.kernel_filter = kernel_filter
+
+    class k_fuesd_add_enum(IntEnum):
+        F_NO_ADD = 0
+        F_PRE_ADD = 1
+        F_PRE_ADD_STORE_RESIDUAL = 2
+
+    class k_fused_sweep_enum(IntEnum):
+        F_NO_SWEEP = 0
+        F_RENORM = 1
+        F_DYNAMIC_QUANT = 2
+
+    @dataclass
+    class k_traits:
+        F_kPadN : bool
+        F_kSaveMeanInvStd : bool
+        F_kTwoPass : bool
+        F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum
+        F_kFusedQuant : Any  #: layernorm_fwd_codegen.k_fused_sweep_enum
+
+    @dataclass
+    class k_shape:
+        F_BlockTile    : List[int]
+        F_WarpPerBlock : List[int]
+        F_WarpTile     : List[int]
+        F_Vector_      : List[int]
+        @property
+        def F_BlockSize(self) -> int:
+            return functools.reduce(lambda a, b: a*b, self.F_WarpTile)
+
+    @dataclass
+    class k_problem:
+        F_XDataType       : str
+        F_GammaDataType   : str
+        F_BetaDataType    : str
+        F_ComputeDataType : str
+        F_YDataType       : str
+        F_MeanDataType    : str
+        F_InvStdDataType  : str
+        F_BlockShape      : str
+        F_Traits          : Any #k_traits
+
+    @dataclass
+    class k_pipeline_one_pass:
+        F_Problem         : Any #k_problem
+    
+    @dataclass
+    class k_pipeline_two_pass:
+        F_Problem         : Any #k_problem
+
+    @dataclass
+    class default_2d_epilogue_problem:
+        F_AccDataType : str
+        F_ODataType : str
+        F_kPadM : bool
+        F_kPadN : bool
+
+    @dataclass
+    class default_2d_epilogue:
+        F_problem : Any
+
+    @dataclass
+    class k_kernel:
+        F_pipeline : Any
+        F_epilogue : Any
+
+    @dataclass
+    class h_traits:
+        F_XDataType : str
+        F_YDataType : str
+        F_XScaleDataType : str
+        F_YScaleDataType : str
+        F_Repeat_M : int
+        F_Repeat_N : int
+        F_ThreadPerBlock_M : int
+        F_ThreadPerBlock_N : int
+        F_Vector_N : int
+        F_kPadN : bool
+        F_kSaveMeanInvStd_ : bool
+        F_kTwoPass_ : bool
+        F_kFusedAdd : int
+        F_kFusedQuant : int
+
+        @property
+        def trait_name(self) ->str:
+            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}'
+            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
+            return t_
+
+        # string when calling this kernel
+        @property
+        def call_name(self) -> str:
+            return f'layernorm2d_fwd_<traits_<{self.trait_name}>>'
+
+        # string when define this kernel
+        @property
+        def def_name(self) -> str:
+            return f'template float layernorm2d_fwd_<traits_<{self.trait_name}>>(const S&, A);'
+
+    # this class hold kernel under same source file
+    @dataclass
+    class h_instance:
+        F_DataTypePair : str
+        F_N : str
+        F_add : int
+        F_sweep : int
+        instance_list : List[Any] # List[h_traits]
+
+        @property
+        def name(self) -> str:
+            prec_i, prec_o = self.F_DataTypePair.split(',')
+            dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}'
+            nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}'
+            if self.F_add != 0:
+                nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
+            if self.F_sweep != 0:
+                nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep]
+            return nnn
+
+        @property
+        def instance_name(self) ->str:
+            return self.name
+
+        @property
+        def content(self) ->str:
+            instance_defs = ''
+            for ins in self.instance_list:
+                instance_defs += ins.def_name + '\n'
+            return layernorm_fwd_codegen.INSTANCE_BASE.format(F_instance_def=instance_defs)
+
+    @property
+    def name_api(self) -> str:
+        return 'layernorm2d_fwd_api'
+
+    @property
+    def name_common_header(self) -> str:
+        return 'layernorm2d_fwd_api_common'
+
+    @property
+    def content_api(self) -> str:
+        # 1 sort based on dtype
+        t_dtype_dict = dict()
+        blobs = self.get_blobs()
+        for blob in blobs:
+            if blob.F_DataTypePair not in t_dtype_dict:
+                t_dtype_dict[blob.F_DataTypePair] = {}
+            if blob.F_N not in t_dtype_dict[blob.F_DataTypePair]:
+                t_dtype_dict[blob.F_DataTypePair][blob.F_N] = []
+            t_dtype_dict[blob.F_DataTypePair][blob.F_N].append(blob)
+
+        d_str = ''
+        for i_d, dtype_ in enumerate(t_dtype_dict):
+            blob_per_t = t_dtype_dict[dtype_]
+            n_str = ''
+            for i_n, n_ in enumerate(blob_per_t):
+                blob_per_n = blob_per_t[n_]
+                inner_str = ""
+                for i_b, b_ in enumerate(blob_per_n):
+                    # generate single kernel instance file
+                    #vec_str = ""
+                    for i_ins, ins in enumerate(b_.instance_list):
+                        idx_in_n = i_b * len(b_.instance_list) + i_ins
+                        len_in_n = len(blob_per_n) * len(b_.instance_list)
+                        # _if = 'if' if i_ins == 0 else 'else if'
+                        if ins.F_kFusedQuant == 0:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
+                        elif ins.F_kFusedQuant == 1:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sx == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_XScaleDataType, f_sy_type=ins.F_YScaleDataType)
+                        elif ins.F_kFusedQuant == 2:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
+                        _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                                        f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
+                                        f_sweep_cond = _sweep_cond)
+                        inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
+                                            F_VEC_COND = _cond, F_instance_func=ins.call_name)
+                    #inner_str = inner_str + vec_str
+                n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else ''
+                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
+            prec_i, prec_o = dtype_.split(',')
+            d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
+
+        api_base = self.API_BASE.format(F_traits_define=self.API_TRAITS_DEFINE, F_dispatch=d_str)
+        return api_base
+
+    @property
+    def content_common_header(self) -> str:
+        return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
+
+    def get_blobs(self):
+        h_traits = layernorm_fwd_codegen.h_traits
+        h_instance = layernorm_fwd_codegen.h_instance
+
+        dynamic_quant_out_dtype = ['int8']
+        # some predefined support range
+        # (prec_i,prec_o) for simplicity this string will be used as key for dict
+        scale_list = [('fp32,fp32')]
+        dtype_list = [('fp16,fp16'), ('bf16,bf16'),
+                        ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out
+        #fused_add_list = [0, 1, 2]
+        #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
+        fused_add_list = [0, 1]
+        fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
+
+        #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False,  True,   0,    0)]}
+        total_blob = list()
+        for hs_key in h_trait_dict:
+            hs = h_trait_dict[hs_key]
+            current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
+            for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list):
+                prec_i, prec_o = dtype.split(',')
+                scale_x, scale_y = scale_type.split(',')
+                if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
+                    continue # skip non dynamic quant case
+                if fused_quant == 1 and hs_key == 'big':
+                    continue
+                current_hs = list()
+                for chs_ in hs:
+                    h_ = copy.copy(chs_) # copy the base instance out
+                    h_.F_XDataType = prec_i
+                    h_.F_YDataType = prec_o
+                    h_.F_XScaleDataType = scale_y
+                    h_.F_YScaleDataType = scale_x
+                    h_.F_kFusedAdd = fused_add
+                    h_.F_kFusedQuant = fused_quant
+                    current_hs.append(h_) # + "\n"
+                #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
+                current_n_str = 'big' if hs_key == 'big' else current_n
+                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
+        return total_blob
+
+    def list_blobs(self) -> None:
+        w_p = Path(self.working_path)
+        list_p = w_p / 'layernorm2d_fwd_blobs.txt'
+        blobs = self.get_blobs()
+        with list_p.open('a') as list_f:
+            # api related file
+            list_f.write(str(w_p / (self.name_api + ".cpp"))  + "\n")
+            list_f.write(str(w_p / (self.name_common_header + ".hpp"))  + "\n")
+            # kernel instance file
+            for b in blobs:
+                list_f.write(str(w_p / (b.name + ".cpp")) + "\n")
+
+    def gen_blobs(self) -> None:
+        w_p = Path(self.working_path)
+        (w_p / (self.name_api + ".cpp")).write_text(self.content_api)
+        (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header)
+        blobs = self.get_blobs()
+        for b in blobs:
+            (w_p / (b.name + ".cpp")).write_text(b.content)
+
+def list_blobs(args):
+    api_list = args.api.split(',')
+    for api in api_list:
+        if api == 'fwd':
+            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs()
+
+
+def gen_blobs(args):
+    api_list = args.api.split(',')
+    for api in api_list:
+        if api == 'fwd':
+            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK layernorm kernel",
+    )
+    parser.add_argument(
+        "-a",
+        "--api",
+        default='fwd[all]',
+        required=False,
+        help="supply API(s) to generate (default: fwd). separated by comma."
+    )
+
+    # the directory for list_blobs/gen_blobs to write files into
+    parser.add_argument(
+        "-w",
+        "--working_path",
+        default="./",
+        required=False,
+        help="the path where all the blobs are going to be generated"
+    )
+
+    # this script have 2 modes
+    # 1) list_blobs mode, will generate a txt file with all the files going to be generated.
+    #    this is useful in build system like cmake to construct source code dependency, by
+    #    reading the content out of this file
+    # 2) gen_blobs mode, will generate the actuall kernel instance and api. If in framework
+    #    like FA, only need to use this mode
+    parser.add_argument(
+        "-l",
+        "--list_blobs",
+        action='store_true',
+        help="list all the kernels to a file, "
+    )
+
+    parser.add_argument(
+        "-g",
+        "--gen_blobs",
+        action='store_true',
+        help="generate all kernels into different tile"
+    )
+
+    # TODO: if using filter, must apply same value to output_dir and list_blobs
+    parser.add_argument(
+        "-f",
+        "--filter",
+        required=False,
+        help="filter out kernels that need to generate, using fnmatch module"
+    )
+
+    parser.add_argument(
+        "-t",
+        "--traits",
+        default="all",
+        required=False,
+        help="enable/disable some feature. default generate all"
+    )
+
+    parser.add_argument(
+        "-r",
+        "--receipt",
+        default=0,
+        required=False,
+        help="codegen receipt."
+    )
+
+    args = parser.parse_args()
+
+    # print(f'{args.list_blobs}-{args.gen_blobs}')
+    if (args.gen_blobs and args.list_blobs) or ((not args.gen_blobs) and (not args.list_blobs)):
+        print('gen_blobs/list_blobs must specify only one option')
+        sys.exit()
+
+    p = Path(args.working_path)
+    if not p.exists():
+        p.mkdir()
+
+    if args.list_blobs:
+        list_blobs(args)
+    else:
+        gen_blobs(args)
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
deleted file mode 100644
index f2f51de5d..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_api.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <ck_tile/core.hpp>
-#include "layernorm2d_fwd.hpp"
-
-template <typename DataType_,
-          ck_tile::index_t Repeat_M_,         // each thread repeat along M
-          ck_tile::index_t Repeat_N_,         // each thread repeat along N
-          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
-          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
-          ck_tile::index_t Vector_N_,         // vector size along N
-          bool kPadN_,
-          bool kSaveMeanInvStd_,
-          bool kTwoPass_>
-using trait_ = layernorm2d_fwd_traits_<DataType_,
-                                       Repeat_M_,
-                                       Repeat_N_,
-                                       ThreadPerBlock_M_,
-                                       ThreadPerBlock_N_,
-                                       Vector_N_,
-                                       kPadN_,
-                                       kSaveMeanInvStd_,
-                                       kTwoPass_>;
-
-template <typename data_type>
-float layernorm2d_fwd_b16_(layernorm2d_fwd_traits /*t*/,
-                           layernorm2d_fwd_args a,
-                           const ck_tile::stream_config& s)
-{
-#if 1
-    float r = -1;
-    // clang-format off
-    //                                            rm  rn  tm   tn  vn  pd     mv     2p
-    if(a.n <= 64) {
-            r = layernorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 128) {
-        if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type, 1,  1,  4,  64, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type, 1,  2,  4,  64, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 256) {
-        if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 512) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 1,  4,  64, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2,  4,  64, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4,  4,  64, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 8,  4,  64, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 768) {
-        if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3,  4,  64, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 6,  4,  64, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1,12,  4,  64, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 1024) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 2,  128, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 2,  128, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 1536) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 4,   64, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 2,  128, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 2048) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 1, 1,  256, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 8, 1,  256, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 3072) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  128, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1,  256, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 6, 1,  256, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 3, 1, 1024, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n <= 4096) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, false>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, false>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, false>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, false>>(s, a);
-    }
-    else if(a.n > 4096) {
-        if (a.n % 8 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1,  256, 8,  true,  false, true>>(s, a);
-        else if (a.n % 4 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1,  256, 4,  true,  false, true>>(s, a);
-        else if (a.n % 2 == 0)
-            r = layernorm2d_fwd_<trait_<data_type,  1, 2, 1, 1024, 2,  true,  false, true>>(s, a);
-        else
-            r = layernorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, true>>(s, a);
-    }
-    return r;
-#else
-    return layernorm2d_fwd_<trait_<data_type,  1, 1,  1,  256, 4,  true,  false, false>>(s, a);
-#endif
-    // clang-format on
-}
-
-float layernorm2d_fwd(layernorm2d_fwd_traits t,
-                      layernorm2d_fwd_args a,
-                      const ck_tile::stream_config& s)
-{
-
-    float r = -1;
-    if(t.data_type.compare("fp16") == 0)
-    {
-        return layernorm2d_fwd_b16_<ck_tile::fp16_t>(t, a, s);
-    }
-    else if(t.data_type.compare("bf16") == 0)
-    {
-        return layernorm2d_fwd_b16_<ck_tile::bf16_t>(t, a, s);
-    }
-    if(r < 0)
-        throw std::runtime_error("Without supported instances!");
-
-    return r;
-}
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
deleted file mode 100644
index 2a20d1e05..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1024_instance.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-#if 0
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
-
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
-#endif
-
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
deleted file mode 100644
index d043efc86..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n1536_instance.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
deleted file mode 100644
index a6ffc8cd2..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n2048_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
deleted file mode 100644
index 80beeca67..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n256_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
deleted file mode 100644
index b362a550a..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n3072_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
deleted file mode 100644
index 9c2d78999..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
deleted file mode 100644
index c0c75f878..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n4096_tp_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
deleted file mode 100644
index 1bcd0f8a7..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n512_instance.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
deleted file mode 100644
index 6b25fce8c..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n64_n128_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
deleted file mode 100644
index c4400f0f2..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_bf16_n768_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::bf16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
deleted file mode 100644
index 7f0e4898c..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1024_instance.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-#if 0
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , false, false>>(const S&, A);
-
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , false, false>>(const S&, A);
-#endif
-
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
deleted file mode 100644
index 8c3a42cc4..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n1536_instance.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 4,   64, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 2,  128, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 1,  true,  false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
deleted file mode 100644
index 04d8bc153..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n2048_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 1, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 8, 1,  256, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
deleted file mode 100644
index c32574749..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n256_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
deleted file mode 100644
index c71db57a6..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n3072_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
deleted file mode 100644
index f3ca0932e..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, false>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
deleted file mode 100644
index 242f1d2dd..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n4096_tp_instance.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false, true>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false, true>>(const S&, A);
-
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
deleted file mode 100644
index e3bfa8e3a..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n512_instance.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 8,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
deleted file mode 100644
index 90d960cf0..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n64_n128_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
deleted file mode 100644
index 0960a95c3..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_fp16_n768_instance.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "layernorm2d_fwd_instance_common.hpp"
-
-// clang-format off
-//                                                       rm  rn  tm  tn  vn  pd     mv     2p
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , false, false>>(const S&, A);
-template float layernorm2d_fwd_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , false, false>>(const S&, A);
-// clang-format on
diff --git a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp b/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
deleted file mode 100644
index 22895e8ed..000000000
--- a/example/ck_tile/02_layernorm2d/instances/layernorm2d_fwd_instance_common.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <ck_tile/core.hpp>
-#include "layernorm2d_fwd.hpp"
-#include <iostream>
-
-#pragma once
-
-using S = ck_tile::stream_config;
-using A = layernorm2d_fwd_args;
-
-template <typename DataType_,
-          ck_tile::index_t Repeat_M_,         // each thread repeat along M
-          ck_tile::index_t Repeat_N_,         // each thread repeat along N
-          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
-          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
-          ck_tile::index_t Vector_N_,         // vector size along N
-          bool kPadN_,
-          bool kSaveMeanInvStd_,
-          bool kTwoPass_>
-using trait_ = layernorm2d_fwd_traits_<DataType_,
-                                       Repeat_M_,
-                                       Repeat_N_,
-                                       ThreadPerBlock_M_,
-                                       ThreadPerBlock_N_,
-                                       Vector_N_,
-                                       kPadN_,
-                                       kSaveMeanInvStd_,
-                                       kTwoPass_>;
-
-template <typename Traits_>
-float layernorm2d_fwd_(const S& s, A a)
-{
-    using DataType = typename Traits_::DataType;
-
-    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
-        typename LayerNormTypeConfig<DataType>::XDataType,
-        typename LayerNormTypeConfig<DataType>::GammaDataType,
-        typename LayerNormTypeConfig<DataType>::BetaDataType,
-        typename LayerNormTypeConfig<DataType>::ComputeDataType,
-        typename LayerNormTypeConfig<DataType>::YDataType,
-        typename LayerNormTypeConfig<DataType>::MeanDataType,
-        typename LayerNormTypeConfig<DataType>::InvStdDataType,
-        typename Traits_::Shape,
-        Traits_::kPadN,
-        Traits_::kSaveMeanInvStd,
-        Traits_::kTwoPass>;
-
-    using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
-    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
-    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
-
-    using Kernel = ck_tile::Layernorm2dFwd<Pipeline>;
-
-    const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
-    constexpr ck_tile::index_t kBlockPerCu = 1;
-
-    auto kargs = Kernel::MakeKargs(a);
-    if(s.log_level_ > 0)
-        std::cout << ", " << Kernel::GetName() << std::flush;
-
-    return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-}
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index 4f12d9103..43f4e8c72 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -1,5 +1,6 @@
 #include "ck_tile/host.hpp"
 #include "layernorm2d_fwd.hpp"
+#include <algorithm>
 #include <cstring>
 
 // different threshold for different dtype
@@ -29,7 +30,16 @@ auto create_args(int argc, char* argv[])
         .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
         .insert("v", "1", "cpu validation or not")
         .insert("kname", "1", "print kernel name or not")
-        .insert("prec", "fp16", "precision")
+        .insert("prec_i", "fp16", "input precision")
+        .insert("prec_o", "auto", "output precision, set auto will be the same as input")
+        .insert("prec_sx",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1")
+        .insert("prec_sy",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1 or 2")
+        .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
+        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
         .insert("warmup", "5", "cold iter")
         .insert("repeat", "20", "hot iter");
 
@@ -37,7 +47,11 @@ auto create_args(int argc, char* argv[])
     return std::make_tuple(result, arg_parser);
 }
 
-template <typename DataType, bool SaveMeanVar>
+template <typename InDataType,
+          typename OutDataType,
+          typename XScaleDataType,
+          typename YScaleDataType,
+          bool SaveMeanVar>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t m      = arg_parser.get_int("m");
@@ -45,21 +59,46 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::index_t stride = arg_parser.get_int("stride");
     if(stride < 0)
         stride = n;
-    float epsilon         = arg_parser.get_float("e");
-    std::string data_type = arg_parser.get_str("prec");
-    int kname             = arg_parser.get_int("kname");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    float epsilon       = arg_parser.get_float("e");
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sx = arg_parser.get_str("prec_sx");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sx == "auto")
+    {
+        prec_sx = "fp32";
+    }
+    if(prec_sy == "auto")
+    {
+        prec_sy = "fp32";
+    }
+
+    int kname         = arg_parser.get_int("kname");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
+    int fused_add     = arg_parser.get_int("fadd");
+    int fused_quant   = arg_parser.get_int("fquant");
+    if(fused_quant == 1 && prec_o != "int8")
+    {
+        std::cout << "if fused_quant is 1, only support \"-prec_o=int8\" case" << std::endl;
+        return false;
+    }
 
     assert(stride >= n);
 
-    using TypeConfig = LayerNormTypeConfig<DataType>;
+    using TypeConfig = LayerNormTypeConfig<InDataType, OutDataType, XScaleDataType, YScaleDataType>;
 
-    using XDataType     = typename TypeConfig::XDataType;
-    using YDataType     = typename TypeConfig::YDataType;
-    using GammaDataType = typename TypeConfig::GammaDataType;
-    using BetaDataType  = typename TypeConfig::BetaDataType;
+    using XDataType         = typename TypeConfig::XDataType;
+    using YDataType         = typename TypeConfig::YDataType;
+    using GammaDataType     = typename TypeConfig::GammaDataType;
+    using BetaDataType      = typename TypeConfig::BetaDataType;
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
 
     using MeanDataType =
         std::conditional_t<SaveMeanVar, typename TypeConfig::MeanDataType, ck_tile::null_type>;
@@ -73,36 +112,72 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<GammaDataType> gamma_host({n});
     ck_tile::HostTensor<BetaDataType> beta_host({n});
 
+    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {stride, 1});
+
     ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
     ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
 
     ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
     ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_dev({m});
+
+    ck_tile::HostTensor<XScaleDataType> x_scale_host({n});
+    ck_tile::HostTensor<XScaleDataType> x_scale_host_dev({n});
 
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
     ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
+    ck_tile::FillUniformDistribution<XScaleDataType>{-1.f, 1.f}(x_scale_host);
 
     ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_scale_buf(x_scale_host_dev.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
 
     x_buf.ToDevice(x_host.data());
     gamma_buf.ToDevice(gamma_host.data());
     beta_buf.ToDevice(beta_host.data());
+    x_residual_buf.ToDevice(x_residual_host.data());
+    x_scale_buf.ToDevice(x_scale_host.data());
 
-    std::cout << "[" << data_type << "]"
+    auto prec_str = [&]() {
+        auto base_str = prec_i;
+        if(prec_i != prec_o)
+        {
+            base_str += "|" + prec_o;
+        }
+        if(fused_quant == 1)
+        {
+            base_str += std::string("(") + prec_sy + ")";
+        }
+        return base_str;
+    }();
+
+    std::cout << "[" << prec_str << "]"
               << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
 
-    layernorm2d_fwd_traits traits{data_type, SaveMeanVar};
+    layernorm2d_fwd_traits traits{
+        prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant};
 
     layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
+                              fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant == 1 ? x_scale_buf.GetDeviceBuffer() : nullptr,
                               gamma_buf.GetDeviceBuffer(),
                               beta_buf.GetDeviceBuffer(),
+
                               y_buf.GetDeviceBuffer(),
-                              nullptr,
-                              nullptr,
+                              fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr,
+                              nullptr, // p_mean, unsupported yet
+                              nullptr, // p_invStd, unsupported yet
+
                               epsilon,
                               m,
                               n,
@@ -111,6 +186,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     float ave_time = layernorm2d_fwd(
         traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
 
+    if(ave_time < 0)
+    {
+        std::cout << " not supported!" << std::endl << std::flush;
+        return false;
+    }
+
     std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n +
                            sizeof(BetaDataType) * n + sizeof(YDataType) * m * n;
 
@@ -122,6 +203,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         // reference
+        if(fused_add != 0)
+        {
+            // fused pre_add/pre_add_store
+            // TODO we accumulate directly to x_host for simplcity here...
+
+            std::transform(x_host.mData.cbegin(),
+                           x_host.mData.cend(),
+                           x_residual_host.mData.cbegin(),
+                           x_host.mData.begin(),
+                           std::plus<XDataType>{});
+        }
         ck_tile::reference_layernorm2d_fwd<XDataType,
                                            GammaDataType,
                                            BetaDataType,
@@ -131,13 +223,80 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                            InvStdDataType>(
             x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
 
+        if(fused_quant != 0)
+        {
+            auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
+                int N_ = acc_.mDesc.get_lengths()[1];
+                if(fused_quant == 1)
+                {
+                    for(int n_ = 0; n_ < N_; n_++)
+                    {
+                        // input smooth outlier
+                        acc_(m_, n_) =
+                            acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
+                    }
+                }
+                ComputeDataType absmax = static_cast<ComputeDataType>(0);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    const auto a = ck_tile::abs(acc_(m_, n_));
+                    absmax       = a > absmax ? a : absmax;
+                }
+                // printf("cpu:absmax:%f\n", absmax);
+                ComputeDataType y_scale = absmax / static_cast<ComputeDataType>(127.0);
+                y_scale_host_ref(m_)    = ck_tile::type_convert<YScaleDataType>(y_scale);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
+                }
+            };
+
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(x_host,
+                                                               gamma_host,
+                                                               beta_host,
+                                                               y_host_ref,
+                                                               mean_host_ref,
+                                                               invStd_host_ref,
+                                                               epsilon,
+                                                               dquant_functor);
+        }
+        else
+        {
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(
+                x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
+        }
+
         y_buf.FromDevice(y_host_dev.data());
 
-        auto [rtol, atol] = get_elimit<DataType>();
+        ck_tile::HostTensor<YResidualDataType> sy_host_dev({m, n}, {stride, 1});
+        if(fused_add == 1)
+        {
+            y_residual_buf.FromDevice(sy_host_dev.data());
+        }
+
+        auto [rtol, atol] = get_elimit<InDataType>();
+
         if(stride == n)
         {
             pass = ck_tile::check_err(
                 y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+            if(fused_add == 1)
+            {
+                pass &= ck_tile::check_err(
+                    sy_host_dev, x_host, std::string("ADD Error: Incorrect results!"), rtol, atol);
+            }
         }
         else
         {
@@ -153,8 +312,30 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                                std::string("] Error: Incorrect results!"),
                                            rtol,
                                            atol);
+                if(fused_add == 1)
+                {
+                    std::vector<YResidualDataType> sy_host_dev_row(
+                        sy_host_dev.begin() + i_r * stride, sy_host_dev.begin() + i_r * stride + n);
+                    std::vector<YResidualDataType> sy_host_ref_row(
+                        x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(sy_host_dev_row,
+                                               sy_host_ref_row,
+                                               std::string("ADD[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
             }
         }
+        if(fused_quant == 1)
+        {
+            y_scale_buf.FromDevice(y_scale_host_dev.data());
+            pass &= ck_tile::check_err(y_scale_host_dev,
+                                       y_scale_host_ref,
+                                       std::string("SCALE Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
 
         std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
@@ -168,23 +349,56 @@ int main(int argc, char* argv[])
     if(!result)
         return -1;
 
-    const std::string data_type = arg_parser.get_str("prec");
-    int save_mv                 = arg_parser.get_int("save_mv");
-    if(data_type == "fp16" && save_mv)
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sx = arg_parser.get_str("prec_sx");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sx == "auto")
     {
-        return run<ck_tile::half_t, true>(arg_parser) ? 0 : -2;
+        prec_sx = "fp32";
     }
-    else if(data_type == "fp16" && !save_mv)
+    if(prec_sy == "auto")
     {
-        return run<ck_tile::half_t, false>(arg_parser) ? 0 : -2;
+        prec_sy = "fp32";
     }
-    else if(data_type == "bf16" && save_mv)
+    int save_mv = arg_parser.get_int("save_mv");
+
+    // no dynamic quant case
+    if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" && save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+            save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+
+    // dynamic quant case, only in inference
+    else if(prec_i == "fp16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
     {
-        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::half_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
     }
-    else if(data_type == "bf16" && !save_mv)
+    else if(prec_i == "bf16" && prec_o == "int8" && prec_sx == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
     {
-        return run<ck_tile::bf16_t, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
     }
 
     return -3;
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
index 861e4a023..a0f2db0e8 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
@@ -8,31 +8,35 @@
 #include "ck_tile/ops/layernorm2d.hpp"
 #include <string>
 
-template <typename DataType>
+template <typename InType, typename OutType, typename XScaleDataType_, typename YScaleDataType_>
 struct LayerNormTypeConfig;
 
-template <>
-struct LayerNormTypeConfig<ck_tile::half_t>
+template <typename OutType, typename XScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::half_t, OutType, XScaleDataType_, YScaleDataType_>
 {
     using XDataType       = ck_tile::half_t;
-    using YDataType       = ck_tile::half_t;
+    using YDataType       = OutType;
     using GammaDataType   = ck_tile::half_t;
     using BetaDataType    = ck_tile::half_t;
     using MeanDataType    = ck_tile::half_t;
     using InvStdDataType  = ck_tile::half_t;
     using ComputeDataType = float;
+    using XScaleDataType  = XScaleDataType_;
+    using YScaleDataType  = YScaleDataType_;
 };
 
-template <>
-struct LayerNormTypeConfig<ck_tile::bf16_t>
+template <typename OutType, typename XScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, XScaleDataType_, YScaleDataType_>
 {
     using XDataType       = ck_tile::bf16_t;
-    using YDataType       = ck_tile::bf16_t;
+    using YDataType       = OutType;
     using GammaDataType   = ck_tile::bf16_t;
     using BetaDataType    = ck_tile::bf16_t;
     using MeanDataType    = ck_tile::bf16_t;
     using InvStdDataType  = ck_tile::bf16_t;
     using ComputeDataType = float;
+    using XScaleDataType  = XScaleDataType_;
+    using YScaleDataType  = YScaleDataType_;
 };
 
 // runtime args
@@ -40,82 +44,21 @@ struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs
 {
 };
 
-// this is used to pattern-match internl kernel implementation, not to instantiate kernel
-template <typename DataType_,
-          ck_tile::index_t Repeat_M_,         // each thread repeat along M
-          ck_tile::index_t Repeat_N_,         // each thread repeat along N
-          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
-          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
-          ck_tile::index_t Vector_N_,         // vector size along N
-          bool kPadN_,
-          bool kSaveMeanInvStd_,
-          bool kTwoPass_>
-struct layernorm2d_fwd_traits_
-{
-    using DataType = ck_tile::remove_cvref_t<DataType_>;
-
-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
-    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
-
-    // num of warps along m
-    static constexpr ck_tile::index_t BlockWarps_M = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
-        }
-        else
-        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
-        }
-    }();
-
-    // num of warps along n
-    static constexpr ck_tile::index_t BlockWarps_N = []() {
-        if constexpr(is_warp_per_row)
-        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return 1;
-        }
-        else
-        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
-        }
-    }();
-
-    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
-    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
-
-    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
-    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
-
-    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
-    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
-
-    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
-    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
-    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
-    using Vector     = ck_tile::sequence<1, Vector_N_>;
-
-    using Shape = ck_tile::Layernorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
-
-    static constexpr bool kPadN           = kPadN_;
-    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
-    static constexpr bool kTwoPass        = kTwoPass_;
-};
-
-template <typename Traits_>
-float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
-
 // This is the public API, will be generated by script
 struct layernorm2d_fwd_traits
 {
-    std::string data_type;
-    bool save_mean_var;
+    std::string prec_i; // input precision
+    std::string prec_o; // output precision
+
+    // if fused_quant == 1, need set prec_sx/prec_sy to proper string, otherwise can set
+    // arbitrary(will skip check) if fused_quant == 2, need set prec_sy to proper string, otherwise
+    // can set arbitrary(will skip check)
+    std::string prec_sx; // x-scale, used for [1*N] input smooth quant
+    std::string prec_sy; // y-scale, used for [M*1] output for next layer
+
+    bool save_mean_var; //
+    int fused_add;      // 0:no-add, 1:pre-add-store, 2:pre-add
+    int fused_quant;    // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
 };
 
 float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/02_layernorm2d/misc/dquant.png b/example/ck_tile/02_layernorm2d/misc/dquant.png
new file mode 100644
index 0000000000000000000000000000000000000000..28b1a61a14ea6774191fc2ac54f195cb86477f9b
GIT binary patch
literal 36863
zcmce-RahNS(<MxBhu}_vyUW1|1PJaBJh;2NLvVM8;Bs(xcXxMpcjhGDJO6w$&s@ya
z&sEdCrFQkMU2CoC6Cx)gh6IlX4+aK?B>v;O0vH&CAQ;#uSXgk-ou>o{J}|JTe(~?$
zlw6iiGoV~B1~z<D$H~vnp&@@vqOPvJ$CJ>Dgql+;P$B0}ZlBi*+B!QoWZODV>}fhZ
zq>X(d?lJ`zPu+MC%xp<$T53z#Xsu~r4P!<0{{$Ky^oI#U0EztXDqHUpILyB*xh^nY
zh<{f_zkE9XtDzE;;O4(uU^F;iZ2s#C6A=~g@5%=r8kXSSl^_(v{~XTm|L5UoNZEwZ
z0lvD2r2`|}jD1XU@d{#F0ER7F*j$f2q`%e;qkOSRh-BR=u9@J6y{mj_^E|WLw_c2)
z6!&!2M73)=2&ymMH&L5k`nzNwJ9t>l{G9<URnAPeFZIk3na>ez#YJT`is2AZ+b!0-
z13UIsQD;AUhDyq6{&{N;1GP~?%r0|#a;6uI=VqTD*)SuClj{-CzEW@_#DB1#967fI
z|IbgYFrcOb5sl#|aBBNP>)Fp<hi5T`Ar0u&$IVM@*R=wVfUm`Zru0}IiZg2xbdNUN
z(4qYJ3C{p_Sxcs$A%PLR77#j$40Yo+EF@eklA`kAaVd=nS$Y1&k8{gn#Z3!4tTnr4
z$l@;Q;w#TwX(5k`?vOIYGsIr0I9YkwCd(tsf66=Jnhr@_|61<z!SUBILt+<n0K0+y
zbzXZFYs7($6))SDeB(NZbm~-B)Fj_2w*Ig9FI+@Kv%^ZA?m8sg5OeM;`4CY%xV3zu
zU((RUT}qqTA9E^D)bZm;Pg>5d&jO^R3XK<H8{-ljB!Qyz9vPF%md<&>XDt;<Cd@=3
z@@)OT99J!)P^=8eKeyA{#Hc~FeY8J*`HH|AX_}4eEjE&|!C884B0o=w_V4u}{&m`@
z;)8KU<Atd?7Hb_04Vb^P&YBHrEZJpHX8p$8eLGgt)598$1W<shRSVzy;DOwZo1|&%
zIL?E}dG13<{Si>sO!Pj`BsP~p*F)hTY~EtU?SA;JtfuaWk>330??!KJ7biyEp^-I?
z>m(xX(Ce%Y#;@!bg-%^@sZjlU!!6c)S&8zMxz;-|IiFyNDIHcE&e?tSm&^D(pT&iM
z&FzXBDhJWM-8WI3I_^Zr2M&$~?gm+go(zAtRJH|}Z>dA8@Mc9I3_xw2XhppHV0e~$
z!_}ST24e-L{uKT6sr~yoh2esRU4T(?0sp<HUu*hUFF4VFLT{;_ag2P??XzW;Lb28y
ztAmRP*z%*T*}5;~l?O5xOKoVh<hgfxG`Z@WbHmQ2OS2~X)8X2H_)?rY%-ed^ea--!
zmJWKOQp&LVInk~M#PRd<O3D@@RvfK{BYV0e8VaRDxt_81{K0>@8v3uRc_7b+Z~EUO
zlSG{a&2N;JOAzi0jj_*h7IU2|7htASJfF8p%qMEDF_V3_Bwgpu$_r}YPc`T0$#u3k
zoH-4DF@#$z%)SurklhFL<q%#_?o9)P^#BpNG<8-IC#g5f3vR-jX!d(VH}^K;>A~pS
zL4gBbdwCf|m<M1QN_NZ?|94}cDsq552~&hq%bNKGcxa6TPXmyV8IXXlvR<V8H|m=G
zR|nMMT!95>U2!36(fV9ttA>+`2cos+=O@28BE}LeTZl(UL#B{QSEWYH>W(4DzZNu<
zObss#3H5&1PSlxyE>>KIBK2MQpnCUCn+SVMU)kYSu50))s@OQOG}aX|)1%ILSFL!0
zi#Twtj0df6K}NzL9)H&a`8R^9KS5SZ%hP~2aMGPmgvpjJr&&JNe|B?cV5u8xR{Q}x
zk33Q8TVHj4U+<M#Liy@-;>KancGfwDSxA(uE@5}z*>JslQ{`^m_%o&V^%;q!@^`pj
zJ-!U30hW~M2>fb!K-u#Mvc1jqXeEbq8=;cAXPp3YfBr#)30~v1_no{|6tDEF?f~Ye
zs8+Vzwx|eGwJgcMAwz{PxJg4eBfFkv|Jq(!ZrWX&Sp<f->}WW%cktB<d11go$lN|Y
z)o0_u(0HT)#cZlD@!<u{H91zp_)6DbbOF%W)STDX_U9B|-t>K3g|pv*4gESv%ZvMd
zlRjLp^;8;)?1_yYCGRkBO^AZ6)QN^9|7=;2D$r;hS>fx0sHRc7gz@ANWdmI5MvhaC
z`T>`<bEK5zSIl|@o2B`Nt7G#m(tE8Q6S~DYiS7fwALB72@n>dkM1-rMKHm^Q_6r(9
zf&9vRA+@;5s@q;qp^M&}3(n;B5y8&co#xJyiMI@0KBjCUoJbrp6m$04zQoDS%O5>n
zbr3SbDBvBRsF6LT78a00<kV4LqCNNCcS`iJ8m8q;yY1@M9$C;zFWmv$;VaPv)BdCB
zt?kiz?)JS|bMF@k-b~~1(%`KAL|j2#8CRY(s(Q<V?yL1dcW$*|1n;Q~mZ9lD+adb`
zDM_^fcc>j3W&@H<=6p#|Q?$;1KdXZ0e@)_;fcNBsGxU!)mM+L62KV-9G6d-t@lrXs
zrFKuM6B<UJP~|>CC{95W#4@2@hXa+lg{MYslcNVbcl=FiRZ5L*q&l}+Z97uct(oL%
zeT40}{>%%SJ3#X%q1uR-9o9q_nciW9?2x>Nu~Db0oFL*v+QVvg?#16x|J#==(2&S3
zjHMNP8xE|o1JwK4ZlTC__3M+YwPp7Il6?X;Dqyqi@U}LQqt&()mxhrn2;qsbBJp3=
zzr6eO{|p2+H?k*><+frjn~GiyXxR$D8L~#0Cu^n3cVhp9Y<?7C#1z-Ejbli!1eqgY
zNHYuZk3#%KIAG-9kjRe~*YV5hVaU>!%KE?fHgeo6)}+U^7f9nLzKRjI=1(8hK|jAN
zYYVU!2kDMB3Oh51=I5fR%&l9DG(@0Z@5M!Jth{-$x`b2z9WWjew4$1)o!6ZdW1?Q^
zlVZ6Od?0`O-AX^b2jR0t%b_WG##;S+-s~#JiOdGU%$B&@du1DEMbyzRQcU)rP7XhQ
zUqqvRoTp9PAzz<r_q3Iv{4<n>>5EP0fp4hfnFx#}SuO-b>c{28-p)jve@6L1JyOM<
z8Tn>3j=#W<u_xVuy_%kt{(Tw+1Ps2=%z?oAT2weJ=i--%w!^h3bYv6|$QbzNVN4v5
zu{VRtW|~H4snfa7vnuRy2pH0;832x={H$^=b<pBp-t6}j9F`voeB(5fyz3+rHI<!@
zrp7inERN5j(}5N(@3U`LT91SGe^(Fos0xB?Sz>0s)274q-q3Z2n7aw23$t2IyQ`J;
z=ZOkC>P_m-^k2=lV^&t4IV!eH0;{65^SLbnJJ!a{wdSyHY1FE7I(E0+zw50MDGWWS
z{thZ9584R)QSspUH}RG&UKUF=1;#2l&F|HD2SE*v%-Fy-qIb*a!u+2;^`?!lCLhiG
zpaf$?%hvYZF4s5jq4x0HzgDwb;wBlXO|Y~+C!{bR{LiPDnEzQmfzc}~(DN)SO+^@7
z)%a*DjC8q0L!y+Ccd{<^Bz&H|r^c{+A$|QxYo-L*sC*&Q?9i&A(%zD<Z<394*LK?u
z0hGrdO!OM6`Tv30qRzig2)0G_Zwq-mj4H3jI94&2Aw1n!T^rt&QJC)~b&ltjC@t(j
zvdfyT?(gk!R?+mvIyD*lRVykB#iIedz-_)#**&_9Z+lBiy0iW313@TIEInFaWqR{o
zX#QNCwWcEnPZ725Cbjj;SUB7J-rgAE6SYoL=LRvR?UEOB&y1KPoDA=x#hLY%6X|?V
zVrA%w`p=iXe|?F+^SwPzLxos#<D$k1aboaeZ?O#ocRpV?X4R4YZ3YH}0Ec<S*XEZ9
z^MChx)>ilTEnsqq$NKvy9`*kjc+(P2nkp65;rM#@)4vJ!1?S?zWl6?fYN~>U*(?PI
zSLNQtpgES-E}y~wor^o=BO%l-<oRpbc?_a$nES6Gf_vbQ6%K<8O3zWxin3E~&@=Kj
z5+5(o0ETTeD7(GlBY*XUTk4b3zjh@cpf*l_EtFF;9MubqJU%qVv!I-wa!@;fef0a{
zKBB02LS#IWkg7NXU0<@LHP7{T571zLvB~$B-?W>39<$VZfm-_IR#IjU=^_4ums;$k
z<E}}foD1n6G4XH|+>{7b1#f5$yott}H{;XvePQe&zdYo%EGPW@kCX_efPq%z)2U4J
zkFEGrgVKis2y`j6GT0m#A^OP|foAMfh5ny+(ExQS?lwXj_jRxA+;1H+NOOV&`n?`q
zV#c4pYp~#vi^@031)!&Y=oBgIvPIZF;e=ENF+&sPDev%p?ceL_4FC=IAr3O@FP`tC
z_1*al5Z^=`i~mDp`=fiTe|vcKD>%_Zj|N_K>^0UMNw3j9x>KwW`i{`qOlkF5oN1lV
zvTa$Th<EMxEUc7{B1Y2-OMWkyQF;(vxy*w6Usk3TJO|BRI#y0Aeq7^YUjksP`6P$p
z_%z<*qF+Ay(yp?Kx#}hSSurpztSVYpSfWnl30A^xU{Pm(raa?UTXkbnagb*4K6*%{
zOd1ulz<{g+_lJ-OOv3g-Y7`9RX6R4!pDJcf?FMN2^>|mMbak3w0%ReveLii5{oNsd
z{i)pnbAG&<lk;vxE`FADM(M2WiP3KN8VnVwFl=X3XB2NYVk*YvL{v-1W2|k=%}9uy
zvPl*!k1f$6jnVKt)mZgBpW_#y|EQdn9QB5PC!f7rNf=ZX&?@4&h|r1tfC|o>kVYcV
z`jhwp_6U!<Zd=uo?i>)bd3|R(^%Uk76WN})Q?GCH2?wklL_hN(K-4VPK`bY7!_DGj
ztdJNJaA&z|px;EqUhc*9dtIUFh>KOjj=(bzsY97ST?*SsvPIU7+$Nrv(^(W>y|6mf
zE4J3wR|SKLE3jpsO13=40hjTNC3DL32J2yNV6DehNb^gMGD{H4?_)>yqYp|N3s=b(
z(@ZTE;(8gJ*@7~AB(M*)|GajQ6xj6B3+iLq57;y><7H!HH%W8lphl(&9X@sXiInAd
zdCYbOn<AA1`oo;CR!i<*RaCd$hMwI%@F@9%6Df0%Q-kz{L|e<QMHhsOKl$RBs#uQr
zf{9T}0M*|e4YB)oG$y!(Q&&vS2`6;;!gcaeo({_=UBP@9Az?L-E+sbS>m8n|3hHEF
zWc3~ydq}T}crC{)b!0bxe${x4;6F~hZF$=6_te%gq#7K}n7c>v6~?f5QnU927A-v#
zh0vd%Ln~RxUCpct$X6OsEwC=SeER8S5OO^#+OG5aX%OkKoS&V>*~_>;yCKstQvbwg
zaAGiRCT8Be%qVmFCwHRb{in?iIFKp7FdBIybZzwqKKUlnK>Q~*7d<TF)}}&%<TkaU
z2`?Iy!e)Q^dOdkJnt?iVOuNSW<6gm?nCBTl?iOv-*>Q`*l4^ck!NA})Fb~(@N)hiF
zZC2c!MW|t)INtPuNUcfUX1PiuSW&xjqm2EZTs3bI0v&TA>&$07VoI$YpTGo7KY_L@
z4wQ`8lpeznDHEQ|u{R#UKy(hRgG0NJ@^$c%cWh@dg8hLLlL6UJnbn?~@`vY=2l&-3
z@)G?Km##CqpbCb+4so%=zMB?Xi}7NWUVRN-NWzvDE*jT)ZsR}yw7@#JeXs`jmc0k1
zL1|KPhROEPRkko%J<bF6%K6Ecmay6YzTs=f2^IW+ZDqqSSfnnQ@5%14zI{+V$~~>P
zLKcNh=xS32BJNaT+`~=lNo)JDCphJcalv!9j$bq3n3tyZyfH%2;-niPjlPmf?og*l
zE_a3wJ?FAU*K<sBA+~Ku{?l(H%_zM7k)A+zZ^aDl0pQw_2P}pF*dvKgX9BWBgw<Vf
z8e0#lVFvo9c`gV*upwVp*U*WRWI3S_#PI-2zn=RJZ7O#+6AJ86&Vr(ny#YB?9fjkF
zxvp#iU9Ktz<EiLFaD+wbg%NH3$UxJ?1M)}Lv;6V^HY~bijTgeB@0iY+f{RJW@uI@F
zvq&RhmH)X52*Bc*vjFhy-Zw61cWq_So;N2aBS=rUplH;mNxAh_3`c*eo>Urdm3e-o
zH0n`7Fmv{a@evfyo*Q0mIxAXboAH$lsN?x1>K=E1X@QsL>6M|sXTb3bZh`jkPRoF&
zuyo>M47kweF*#j5&i~Kh5I!vXiU+<ejyQEmyjjgy=uRCB21;H=>wX{dKQ_4|SotLo
z^n(kG_QouvS>xxz;|LINpkd?VSD)4#uXInd14P@F#61!04b6j&cZg%>D+)FkH(eIG
z>u7><nEfO%kj$wTnMCamDvRz=<38RL=3QZYJ}5xhMy}r@x3kR!&0pP=?+izG=XrQ*
z@u1Qy{<Jg+rADmCia5TUcmxP7s|O|lv$TIzlJ?SD7A^hF`bHITmcQ*c0$I^ul{XT1
zJ;5=1?qsH@{z%G-eAeq|?ZriI&_=xD`oFOH=TIseoEN&;rD1e)GCr@Edg3?Vu|?w?
zhT}ZA4l{m&%s}{faFLtN47#6u0SBrZcHLIBWX8I(G_*vNOK2mL4pS71w7#snWCx>X
z-+zVo-GV&u^54BT12rC<5Z_RQ{lYX1$0MFI^ujSy_<f)8T$=kaHEQEqXAhO{boQY*
zq9ty|u-)+I$a4HV<?8W!(IY(*CnErZto_5!_hsGJ&cuCv4&f(=zbDf_yBTawGj{&J
zB>G@wG|8;aK8~)ZNyL^VXexh8H(M8LB_PQY@zQB@3bom{VcHGstdaYdbzr~@M+*@^
z!(g6v2L8<xZE#hAPrYk@1Yg_f@2KUU!rWtIy#9Mq0KE_p=)?5;PgaY&$heMW#=$8^
zV*rEbOafsyuwf9`1|{rGdk_Hg`OAzz|2%4fkV1C&fAJiC5E%la(*G~)Fah<yccCSW
z$6GsA<TMTXDv^mLXDfGf>xl>Bx}=s!7NdaOsHNJo&2rO7hS4t(keR!7O6L|lPjwO=
zFE*dil)$JO2Z<P3PB)1t8vA%ze<#ALt#+`7@He>{3}ILado9RgL~gOAPf379>KE}%
z4kZliBU%!ls>c42(Ard=Sa;ahB}*E~&!$sBmXn`7xf?5eNl)V{D!Wl2y28IRHc&@C
zo|bb^GlqJ+{vvP8B-*({R+eI6NW$Riw}<t=Kv{+_7Tgbhalw|S4QR$xbjgbO1KPR5
z0wfFNo))s=gyv31t;>fcF=inmRT?rAybA8ufv>Qd6&yv5&--({C?C^6kp4*oms!b6
z!KXTS@VkisKDG6OZE(F7S~PzS;W6D<(yKZ;beLOA2Ng7H7nMyD{95QYUtx_FTK1so
z@z0k~TXw$>*$mr;lH%3Rj_jyyD2cv9HEFEAw#5jUcV>rv1a~pRW#M0s<iFTHS(>@{
zcebIE*~rFKy9WR)i4`Strji_;d>DWK>{RpOy+0_IEth%%@iIzXj7Ezlr;*>AUebKb
zh>+(UNHNJ;I84W_5;`(a=Z|ki%i35A-sX|IN@`iOo02_n8E)q?4%#cJUu-U*;Z}Eg
z%i1F}>%$Sdn6%W4VIFgB<})Jk+P{wWOM!vIc%T6xt!65--^adfzTcqQrUcIQYf?>s
ztB%ZU_rDr}#8WUCf9it^&<1S7&g;B0J*;+A@<eU?;7(^R%oVlrfCPtWg_E9g3+0%^
z_=YO9{=wM*>I9O{vtPl$A3lRb8uTwVg0_elj*sIvaVR8tEOxBd@HGetYsacfY^?*O
zNT4RTpe9eBAkRrbc#J}D(`P%~1$D}@%L$uy=9es`yFFY~Px0B&A|Ki19Q6NfbmUbp
zj$eE0=pFYcSB{<s|MxcQt{gUf>#VFWbJq@T|MxcG;Lc{xo!sl=7pq{96|o?hKb}}{
zlOG&b(-6evsgPj_<Ou%HmHjmr6rFn@oE!y$=qaEALnw&fV8DPn2>5h*LZUVfgHDZq
z9RdI6Izb)#Kmqz!z9$ChY%HJ||8@L3lNXrp3oK~0aG)9gcC-b}bPWN!Dh5LRQvd1r
zcc%Y$6N9}OZQ%1ucsao!WqynJ#TG*RpZ2~=`nuopmRx-V$wP-0998oNI18c%<To9a
z>!+QTl}jXJ{5MymPh&I=t;n3Vzms{L{h1ynfYl`5(SctL#D7R@IOH?xM#ZN^2&K_M
z9@w+X6zj8IX=mK;5|>piL@R*)&&2kZ^EJMupoOW%M|Ntt5+xPNS-07Icv&3#U1-?D
z$)tTDF0TC3f$Q6;%<`V!snz_HZ90=l4YxpeW-<JXy56;yhLT!)j{b1l+7c!V7YF_y
z*KoPK9CaqVv^RHdiF_pIdS9%7y=*R{T*zR&(d+PCzzZG0uHH+cc>QcrCE1OGERZ`9
ze$`|cCGtxG*y~7bhC}V^U=K!}BST$p$b<WuPqUJ|rUQe`)rh-!0Ipl2EesGNDV<x4
z2wZRPNR?52yC|pcQ{3Plpfbrd%&@VpZjKTSIqdd{x0@O)f!AxwsatDFb6&Qw^i0y+
zdmXwEYmy!YSqSHKe5?IK-UPf=f2lbDzTI`<JkZQ+DZv790A{wzhK)8u14x6OPij+i
zJ(p3-Lc$(4ut&wVyrcNciE>Nhy5An&`PqA8EM#<+{T8(jj>*-%&No*!i1t(pP^Yzy
z%N$bfY?oW(kfWVj?{%@rr&yum#wAAcMJ!ld0@8qbuPOF#fJ#Vij*AU3ey8RuYlJva
zYb8{Y_iV=k&K@T3^P{D^h>4wQ8f^8MBowkc3$8hzW&&BtI#LSf72EtrJuLz@Z9YI}
zFc3j{#(ci)XfnsY^u&GVhm(_2xq97r2G7BnR_;s0JE7rMn_VAowEFZhGmVYc-d_(5
zW<2c|@?KH&aDYf1+&$dmub!p#aRR5mKTb5AUY@ZApj+D7-0j-u4VUe2kC$C8pwDB0
z#B_)ld9(6>NO2dkZyv8DahbVu+^Jp6M}?Ljfz);Gn@&3Jv)+5!w|pEW+g{fTpYmgy
z7)fo?qw}{oiLsHbEL$oHZ#1BSe!6k#uU4@62K;{QTj&$IoQF<V59_`Vpe2~@cRJ&=
zzK9f%l82IZ4@$Y3HhNx4yY6uEHNfE;D{^3aCXsrCJYMv0KnXR%z^ez7go2Dr`Ubpa
zAFk`np%Bei2>DL(iavgP5-;G}Ge9h%<w3?bse^W3Gocaezab^Z_7kf|ZZz}Viq9;y
zLn~;2KuXJtjg(5z&AN3#!w@=ZZ2s|ss8|&TIsEbv&*bWkQ3T&av^ZbRCIJ?Zb*-F3
z<D&=QmSD~z8{wzu0Eav$hxY$gIv3ifm#<Jfb2K_O7BM0pNCqqP<(Kum>)SHUG9o>h
zIE#RQ4iyF-XY`PP#ZNY7*Ns9l3W}hB06MkmKl%faCX72m@!-oNqobpbSa9jV!2@mX
z4<4SLQ!A~lytg5%x$^E%P*4~mYU=8E4c~$C_mk?ks(DjCfBuYrWlIR@j95&oeb#e&
z;5K^kvT=V2Ysn}&y;CL03L_@i+&6x4a*iOVr_?>mUnO!JeR$oxuJ6ppJX}2p+QIR1
zjW^a(y;8*nZ1>*yS%1i2*bUwEtd%Z@@h5)TUaWc5UWx71+}?&o@otH-`slmSG4sd4
zDZBrZfTE{3F;1{prcSNt)zhBbS*J<pdS2||fRJXmmcDMLb%{P<=-EKoq;WlgWY39h
zyT0BwG(0%6bzMbok?7#)p|P3yrx^U;dvdZ}UcuvX>Sg=$Lzs1I0;`2}iGTc!Nh~lW
z`lbqTGfE+W{XM)MBQ}cS3E<_qLnnHKuzI~WQd3<m2U%ZLg$uCxG7a?3;Hp#SGa60q
z3&WNqi=9}mB7k3=-7)PToG81XZD;N;Jv<m+=y2fW)>t#u_mdbnJwDLfI6ISJ=QuK1
z?v7JKdLvU6b#Et(L$_n^3d_5RpPeFJ<5pt+I0hF>M`S#_Qq2tz=1tkvr++gc?6oGR
z;<1uqn!NRWuaIt&9vK@0r4L3f`~A_`V#OHMv*kKkKEAh>i*{j5eEN%(2FB^)r5fYB
zg{G#aZ{NO|Nd8fId%hDM50*MVU8)(p$D&tvxY-{&TWKI2Nae6xmKAury%+oPbh6rX
zlP{Un+SZ28ZdPl%)4#jBE1xIY*VmWRI#;2EACggAY$_uYnUsW$2DruE(_N{zq?Ab)
zOyux*ybMMmCNaIay81RkNI=lt-QDbb!Edq1REwk_9RD<y#%V9f@l6;PUNfrM$o<*%
zkXoX=u!)RexBa+*8P^Wt_<r#R0I2tAY?6jP0zDvk0KbY+_bxDf0Wz|USSGLV4uc*u
zx=F%+Cmgm`($-gBNgevGXAxyPMg%ifTFp3{9U5;bQ>in@L?S<L=_%#0|M6}r5<736
z*)4Pnwek1IF&3)Pz}suPHzk70<C1QsV|vWs=YAq;nkX8+P&hyc_R9N3%hOvu<&A4^
z;Af&bf<VoRs)6A5Nh&=B4B^<u-@(`uD3Qomge>a)lyF4BRBbBtY+^Z9Z7$v;Pc2CY
z$n^QGo@&1M3&TdX*v1jsKvpIXhW>+AJ3@*&LB>tgM6~O@1|NJXyabKgwNRH<qtaXk
zent=J%|%N&l+0XH^iPyD8Yet&Pb=VslXy6IV#I!-gQabg&LtvqzW%X$hgA2DQ(~wV
zM^6DGyb|=%Gl;?TyDsvqQh5Q`TEkDSIwc|TH!isi5dpLwYKd~O5`1aFx!_ORSmdGo
z75&s{451W(28)oyjFLph5^c?7`cfT5f@q8v!hPW>3wTs~eFfB;Xhow*3chrnzQS9c
z-2rfi7nywSlU)`|)ll@`zu~lrN(8^>o>$2a?%HvM_MFPoCQv=*{fxkLT#jIjZp&Fr
zi;3CV-Mw{#@0b1dlG}EAdb*y{W=Cs-^xC^1`jY)6f4+^*g>gs)q@>ocQ_rtbu|cOJ
zCGP?$r}qg*JcKvppqS+lLx*q!n@bRK?ZV&)Xov#T#j37qTs+)u>%u$pJ&<W+i&C@M
z(UiUlXy<Ad3^9=fh3R0KDp5EK#NREa{?<E<3*jH%*W@pBi7~<f|M)nY`cT}umcd7p
z^H^Hgs1x=H@RxWUxmD(dC&b|9(tA&1PNC=J=Ag)fA3N{}Nh25B)V4dGEv~m*8Bb+5
z-yMvRmzSs4sBd+-(gUTpW``rc@5d7YAHAUrK~2rg>uv4}6B9t=iA-z;&BZo%XMboU
zXe7LVS<AiQgu!EE0)UScEd|AXh9l<O@#Li3tcdwS1)<wYv!i+KQi*b@96%6GT}|z5
zu?oVJ<hl3q@)Gns0-y8z@oLA{*H_@<)!xG5^!@E6hJ{3bWMV=rH^c)OZV&Vn6m-cM
z8EcigJsCXCxonblsl!s642BP7`s>R&&6gfdsZ!STUk5m=h_K88+Z*ja2X0u&3gOwi
zM4J<0-*ZmZ#DuyYuw?!*i&+l0vPg(7UGty^idp7$ex~|83dnFhtfJHS2DJ#+&sj%M
z$=WBVo1_g(a8D)8ej&A=fRisJsS(CS@6lJIzQU!ET@b(M!fHI)H7n@e<tKw&gpQk%
zFFdA^^`&NmXVLp{Vf5hWc*(fBVt`g&i3Jm_F1_z&c?FvL2lRR9hFgPAa4tk-PWCSf
z66=U72M+A?u=qDG{uB8U^@=?D@9E1jxvqp3rOT4f7vURDg+3Y%TEtLL_FrA?%L?Og
z1sKFwcO3cysYEhEInYt1TghkCmmn9bux)XJwC%Wz2ip9S2?)o2ony5UA1A-f&K~bs
zc9Z{MgOVwpFW4RNB$VJ@nxk#c?&M2~GPUyeFx<+gOG$NN+Zr`B-e_yarHS_57{aY*
z>{A$R<C-6tif6CXLP5aN8jKvDf(7iZIytrUT*rL0VF2jFinDA0XPWUOIwA}9=#dsC
zbB>m+A#jw(AP%LQq>tdulV;DT76Q4}6|mAC>fq)6l6&uy?u*xbqgL*jkDjWIw2_s_
zFeCZYdnq?EM1RV^>a!-JG|ruH_eF$$Moxu9210f!cN{sgz%OWo)QZw9G$7Q{@l?8L
zg5v2CuzLi%<Y&*NYf6(JS&w0F#)Yt7Un2(9_@HC^LWG@rHWi$sU`n#fo3d$1y%m^G
zQ;;#(7*|9+tH7ZrL+gEX%27|n9?uZlsa9Bn%ojtxdlHTC{PRXEK>b~Ap_+}L;ufkx
zGy<<sHmkm_j?eXa7Ze?l1YFgoGZYvrTm&+Lf?)9VR4jSQvYEW!BEaECit@!iYv;((
zX*JKTt*uQ>eZI>|<FsBcRRtC+7Rscufq<j1`@^{ibvFIl>guG>EuM%`%#VOq>+y6h
z>yJ)#5Y7E=6bE)xi8|6`%o=y?Xe2Zq{fyO7TPtEN&t=-cK-4r<-(Epjk}c7-Dy}3W
zkm4mE;x8nZ^@|mQE-7Qe>QEm8fOsd3l|HfVB)}+abXkpsX#I=pMhQCf1_+6qFL3v>
zzYoW-V%TR}L*II6Hj!g)af7yUW-D;5fub?xCeg`JkoiV86#7XnNpQwci{Nn;B>+It
z)XDVr{W<@qp;6AaZc@Jtks0-sfJRBTkBrKLn9)T>QK%J}?zWe>q^4wdPI0asgMNPu
zY2W@|G@FoIaD@%k#TN{3N58JCOSyPs1)+VL`+1SyEvkWSaE#IgX`b9SyY{a2(>`W~
z5VBW{W0qzuIKa<LY1RkUjVbt<zQ`H2%sXC+GX#X6ljzp6biQqB`|-lv^<?sUh9ZXK
zQte8GVbPn-6i7E%twqbwYc`_Nq5)>*TRdOtfj~wyo?GWGKPVHf<L0q+uA3+Odr{ii
zulBaKO^zpt9UUJlS-DdtX7gns-QVLVq@$yAWL<9$KT8moH{0$&vkdn3e!s=b4-dBl
z0SC}AWx3i|r42+uMy}9maawD3oLU};BF?iSIiAk(l$R2hl-#*mG|KjRdln8x7C}>s
zol!CZcV-`$kJl(-UzBWm!lTiVxpH}om|E{*h>hENI-Bp0BX~Xj!Kjo$luwlkj2xk#
zT}ih0w1kwMVK4HhTA1WBOFnz@jTrZLF0^Qj^}niAAI}v+%8)k^)W{Cn-Tpyl6T4`k
zTr+ni&b5rn7?OnfA)opzpIZ`?_SkLXZ2#H&u5R?Ry8^k8-^e|<_^zla3&b&m)yctf
z!@ax4myAr)O5SJfvg*V_vPt<%IxTk|p?w3!@QnS2N4@b4=No<dI<jw&i`SO<yZ7x3
zO%l!h#4!?3bCWs>D{3ArxlFv`n1>W@Y*P;nH7P3>m^ckUvw?<0oB9Xa0;Yng>%~h0
z>FC0uk_;viNl!|ZJ*$xp{^61J@P;Rq@lL`al21!q(Qjw(o%$*Dot4+_3!jnbC12-$
z<x|-}hfd>CF?_S(N{`YRdu0P1RwC4d(p+2;{-sa?b#jGDGZaEB&#F53A-{UY%DPr!
z1N4yHPE2TciOJ2IC=!{O#^-reS>N-Nocg^3gNRmo-NaSzu@~pfcP4nLzqlCuZlU7Z
zkjrK-szp4;!GI(9(5?peY%-&t>y|zKHQbU=^jkuO6KLdS)d({@)Jsb3oj$~tWJd*E
zgBE8;7^hV<K(+tu0z`E_dVx2{>EC?OFc6JwH^rnzd*&G`DU1m{ZlA6Uc^7`5&<al+
zI5>VHC`)ad91pPn<zp`4+vTVNC}(B6WwqR;=Dvi*{85)x`o+Vpvo@|cm>4_A_y_=r
ze)y#pHRk1|#c=S>5liF%3r`N=%yg(Dsl+X{sO*TLjeR^!ES6QaV2nG~%Bt&3bMAXZ
z{)0fYuKM#K&IIzO0&=IuX>7nOzL-2=F(&+b{|2|*#q)3iEmcwOVYA26_0de>$Nh@+
zkM4S_wdWIH2Yg`n&A|i@4^JL66qNC-yV!kD^W=u-1L3h!9ywJk>4Eujt?5R)=M+Pp
zXap!=aGy_hc5dDtHW)SP%Rq41dN(H>e)WjX96eHEBHOQUiZ#WKuYa>JxTWEm-95Ri
zsDVzsun+~&;R4SjF}~2Ul9z*AEDYWZ6C=Bt-3=a7h-OwxiOi<AHtxti$}pP<3_gnL
zZc&rae+!S$(FGL^5JnJK|8%@KAb$MCnWJ6A(A2t*M$V%JFx@k!>G10670hmn_Ib$A
zTcZ<_fP`5#7&rMzi;#k|!P7fLZy6XRa3!~YLT9nbAeym4D~m8j*Cb2vWYwC|^5RR!
zARG3)>uZ7e#o6nCWPybpTO{B3`1tX+d_Azg^t{>?6+}teo}9XxPq=I?GMT;ceVdYb
z7dH|*L*`F*tiIEfG{c&6hoD_MsOIfD8MW5*td?^BkP+K$P(z>C`$brqeBJaIasaOu
zX(WLi?0QBhafb|2%aRCqoKBN;yqdf}_)8jo)cE=NCDN%)gK!d=P0{w-`^&}8sWG#n
z5>hT!cmjpmr<;SkJ+UYv5zH451k+^T<m6nKnVF$-we}8`$QPmpYPaDjAMrS!XWuF+
zDD>*xSIgPiu^e9p1_mxIE@nq&G&ONOyYG+U*23T2M<0K67cL1lKz2l;r>7r?z<=26
zg3yQe;3p;|L~Hy^Dw+*Y0G(bz;FOr3Puu|a@Ejd6olX}9fzOr#Uo5M+2xAt!w7`S?
zb%?gq7Nofythbv-xW?x_l$1Rz1Dw$?*BZ$3caPj4zxvus4$i2&@hKD+6x7ydEOccW
zCC9udEND((+J#8w@t!pPpd^&mcA=@M>VPwA7=<(_k=n46)-(t;nt5P=I93C;#c)~3
z;vFYSn;YIpN~T!^<{Xtg9y;8i+|f1M+Inq#G8npRzsZB&4XUCq^RhMu&dw-C84uRY
zrgPk%tGxQl!Z%w>2y(kP){03}-nXpZFj*gmgo9~x<@lZhWHh7VIMs1`zFxKnxWaJ%
zz89c*+?a=Cj%wv!u4&Fg4eZ#MFn+aTqMsCTa4Tvl4^Yg`iAG<IP#L{AXKD)ENrnJ@
zre;9u4ggHjFN#<1Io%h-x)ueyC<&)c!(!!sv60n_;~;FaIbWZNKh@+<+BrC0EVo^I
zKTAvg9Y{X9nuGxe3{0;1fLWkepY8#fY9S+tWoKM%ZHawUprPhpPbt}#{=2vxnc@|z
zG;bo9_LUHlv)?Thu8Fe8kguMMCZ>~J1pL55KwR5gmrQii+?^ZA^jG<uB40fPp(lmD
zqfIC<LTS>`{xruOYRx?D{PaNd*H5YXiuLJ3@Rav!S^kJtDGFj~H5-nw9&(ijo5!k5
z5rcw?^J*L=_JaDFjFs^#zK8A2DlR;r+7?da$5Nk{2kXfn-Zda4s|}*g(waEU7gen>
zq9&fc?f`t|z5(@akriTh2K;u){M5Y5OE6XBp%z?51T{SW>vTW_VZJS;xctL|NoMr#
z<n05iA?+ffdP>u?zEcWiX_@!K@ttklz~^R3Jw=wekJrO>+QH_Y=fj;SL*)!8F>eRw
z8dL_}EL%pkxuz1+EOo3sxy=$)U}W<!)nFd7$B!Sd$G=FIK!gQNo!#TnX4gg{pM{o|
zmYTX4l#d55>FDUxYfXH&Z_3na<OW5UT@S`HGPoUu2(QKCDCSC4QHFGkjP|p<-*JhF
zqnwC&U2N*i=AxV7ApJnZ9u#w=h90CSdn%jX!MCF#?|X#lNoyQ)M)d9A?`BpO$IT81
z5=l_hi1OxVf}tB7D~y;QES779O;z?=g&{QYU4H0Sv!Bfmh!`K?xrGA5?}uaXOX$K!
zV{zMo`t5a5tdTo4GKQnP2_l=LTOd&c@+VzeJinE!S$5jvsGKt`#>TAD3({3X9!5<I
zK+1g2g@qS|9R}bV1zV>%JEWbHKee)QRWh&qr!c6w<6<0Z_Dl0Pvh4WGLy2MizPF6U
z4}qAJubfg=R$6QFz4Im<6zKAbo?#5R`l{RU!cLStX||xVIqvfDAr?NZJXG<ph_#1T
z{V`c9gx;ctBQf`iRG_*3UjKmm@xV(vYYZ+E`8CdeO`aIK@721VEYpn+e|^+w?n84y
zdjhAPZ>c{uxOs!#QyQ6%`thKhZcrQ)EMt_bp*KgM`U)i6>h4fnOy^fzH?2*OU~;~>
zS*h0tUv@|YAhY?=8V(+OoFjh<LfU#505Wf4EC2?ApTGYCP$!C8mzkNF-E3B{|Nb5m
zU~PTz_VSQ-IaQ@Ew)-1HBnOqr`CQW=?&p-8%Aar$M-**hhbMqXGWi-xK`ItmRs`<M
z#nY={=ZSa?p;^od`HGbipT9Oc;VP)Lq6+S8_{W$~)%Hf28M}LL&429S&Y5;O1E-l%
zcP;E2qE*q8MFYRDm>#F7J5)_agWeHK#7g>)S+?BvUzXUmGYL~kddhkXzB@(O1MB!*
z$czf|Z>c{m&(7M<MfWGyiWAXf79p)ES!dP$q*a8sVSK$%UU<#ASmmEwdD#d4X>-0|
zMssG`Q?W;_y-c_-wN#nqMia~2cV3V8LA>7o2|Pyb{{}beZ*aFnCGtIyBVSN8JGXn9
zO%tsHq`C#=?QZU@TBV_hcx;aYfHbF3u-}0l+fIB`f|5T77FZEs1~Ap(6=%-;q@z&o
zn+X&b-LzU&gs+|qeCRtOON+#iX7>3hfL5opxa63P8x4e+?Y0d*6Xsf+lLJfId2<_X
zM4qH{-79mtC$n2$=R5>ByB3~u1#+CqP2GNdd4Z5?);!_o5h`U`P2N@97;~5@m&)rD
zc{?_z+;ba+i<+2@v@)wHOS6|J0z{~O*A8KN4>+#AGTM*gi6z%#(QWR?Pu5`j{cQ+Y
zk7=i0=B_T*X3a=IHU>9<Y<v@^FB>f&=NzUW*Vw^1eQDnIdTJM{kX7E9+%!nwB@G~q
z>6vcjA(<w?!_)B*o$@THZ{0V@jJ##mh2Vf%dlv_ya=C}?3LbZLr=NKgN2R!~I-XNP
z8b$U3T$0PaLtU!{QIyMN0sjycnomwA7kdqM5>br!dxh<W4le4vrH?&P1uln?UKAFI
z6is|%f`%6sQH{JVD}ZKiSf8A+eCSEHZXnB{P@>ybs@e%|4C^S@RbMDqg7!&j?^G^V
zzq>t}iHwYNzE~IWeC!EEAr)xxcw&LyKkS>%myD^Zv)*Wz$>2VjFBfK%!+r*797VuL
zZG1%N+6$06834{7dQ5el*Xeb4JjZT6|HWFE8P~B@-re^P68$&xBu1S`4nf_I2;O)_
zU!Ihsoi8rjL&1{O57tj(QNN`WWz8&K+=CY&t+F)^O7=b9WgZV?v>~jA*qwSI@a-uG
z#kE|xOLKsba0H=_Ci)q^{DaGxu2WHI%f|?LH_Ac!{%gxH$q%ckm6UYEm@<By#kr3K
zvkHj#8>5jZ_q@2;lUA0zQIcfimo;&@+rfA1iIq1_+KK{4fuA#pf7_7W7gV&T2gU@{
z#e=Y{#XkKpp~ZPWGX6jm-^fKH|F=O(G_SiFS65=Q?*;q4H{Hg#%d<kB72TI~=y&zW
z`u?lxFKIeav=&tqDzf^N-VWwR#z7^cgKyNO3<vN~7E%mmVoaFuV`e3ZQM>1|+ELSN
zV=8_7Lx{ZmN9H5IJ-SmYljWSyp~5xEw|{32_2**&K99#qDw`sWpb`d_k`z|s^Tn#2
z?Y=Pea;&D=0_ijoQc`(mq~e5*j*hEBWu=2csT99^5l4PX$^j6acgb2^$?`+{MiV4u
z+X?1`7f2zU+SuIOZi|x69C&$YlOYqFZUK_f^2DM*3gUgc=ZoB?ps&$(ZwOU;c9bA^
zu#l9xda4}N_pV&4wdSIW-_361T+)UvD4Gyi*moU*hOn=sEL@5WO<DFIoKP~eE{1ew
zgzocIi+633Q|IaOFK8<5IK7+wPUZDvus^PssiJ<}J3-QkRXC>-M$(MeF=v7_CTW#k
z%}5ot>%`itq}=BaJhoz=Ke!Z~ksHes>22Tx@C2?Gl-8V*Dw}xpQEhnnhgu6B1HmA%
z+0%V3BK_<EKC{3ae)lk)sXekA-G7abP9!*U*KutWUNa`E%F$xM|8?s{;W{D6ERuwg
zHnZvXcb;7S?J;KUIYM$6q5~&Jv_=j6wvwuDvD;T#rwKl_%9cy-xv^Vl4iq$gcbMQ{
zD4^n(W8_#}>d-w^dxkzLw42-7$nK}zbdGvOjU~@OTALzu(5I^bxM}yJ+I4Nt2UuaE
zAV_C=Z}yZQJn=lCk(Qa}ZU45Kx^DHsZLg!U={Jguc=>B3%_Cx=l`3P}#y(cI4J4kw
zi71cSW}(L<m%lY7oDpY)C-nlxRK>Jhj0k!+tB=+D2Q#P(3euCQe=IDvdnqOMJMY`0
zqg$N<P2dI`RlX}^R_L?GL7$V96OqD>eizJ*Rol?Q*)|&1_T|Yq((4OL*$-Zp4=s?B
zL1SSTXsCY5n*mk97ZhiwN0wTMutbea<0asGuGFn^K?t|6HJ8?=<)?30#<6i1H+c;s
zop<NBQqwZ(A=!29wzx}g1rx3`D{}{lFZ!yi!ffgUJi#>=Ho~kQT9<cFo=Q_n6zGRu
z;#1*d7Mb$oB}P%r45x%|AZblid#ni)C=+wU#Mgr!Yu;gdsxwg1+w;ILzKN4cxPJQ_
zivp`@VsZ!)RCl(wN#cg&fRC$9_A3q60`B*8!9CwHY%mt5I0o0{dWpClsNnltU0qEe
zYEeb@|7sdR5RUaA;p_c$i8WAYs?K}?ol?ePtr^pCP=q9=xcJDr<IM?FfWf3y$-azi
z-Y!yOmQlOp{!rgsukg&LwW0dWhjT%%7wpR1-XZcm-<+bMKI5aHQw_8)tR~i8MZKm+
zNcqD3u1`X4ll*a7&nZjs=)&d{Zm+(bpM_c_OAal}YhoniWrsqJZ>M$WJ*+3qIicSd
z=@!g0pN30Xc=Tw|0oK6j!@&m*J{NW9>NwP$u5_R|ZH!s(Gx7;^-#{Edzil`Hf1RAQ
zRw_9X;m0v!OTKq~v-&0zn<sn2ZTSm|!K)<8a>}yIm;}CKBPN}i^KtRAbQ!LZ;B}X^
zX<9JrajHBa>uIjBJJDPZiB07;5wlU(U145(?9LhE7MX=LH85E$8q&&K=)O!hvORv#
z;;wpx%a|1Ns)<`dET>{tR=Nz^k-9h=8wOOpER@L*s&G1*CRb{iHDgyOZL&Y8w^(8!
zC50%)EGb%FUk4ScG^!2Ys;QK!-0zM96LG`}bIC!Plkf{36axUT2vE4oAg;CBlc=>#
z<FEwvjg5}>iKk^|{=7#|MP;&7ZD?d<1e|~f?yUl)6z{h?RZwTpdyMLam+0#6hYs}j
z@d5i24w9^deIZ2PG4Ryu%-BsPzr<I?rWdeaG@=p@-9;g!|GDX3B-O1B@|vb4jo4W^
zLdMsN5}-Mi55sIsgS(zItU65NzseI!1DnTT5FJr357~)6WwWc(BIB#?KWovL%n3T0
zBGx%=7og+hP*t6fo*kSqIEyIr{%TW%#Dh=Gi5VNg+Y4oOOl8)JQh>c{>WqB%SYI&r
zF2?n-68jtSylaBrtG*~w9x1VZ^|yitvDuZX^=vxoRg+nk>t{_gW5y{iiTj{Ej~a3~
zxYfKY&sK{??jdcx)>w@NMGy1zEN1>p*a`R4g7W=N>l14?TJ4nlX_#gB)w0JZ#kshs
z{45Wehw2?M*}Io;#;rDwSNPQeduKf8#uvcPj;h!Bn#1QeN|eKSyO~{QL>}q!;bK;s
zQ2)BVwTK%5&*m1*rfU@gg%@>3E2gvNUn^HHF^1$eF^Ws>DWwd*T@rJTK3miuz(nP)
z71MNx@yp6bSw=#RKW*l<TAghF_=?NFG_Zo_nf4g@!sW?HnVMWFVNAa__sYsb_l=!u
z)~=@^Qv@n}-GLUN&a(V1pPhhQ)YqN?4L~H0XEN-rZ6H)|-vgcmEEpFV0@ax1P;67B
z8ZKfiUUMA$yHPoMI-2ubo)XHKTNLR=0mZBka*~1dz#71{Rd}`R@4owXE2>~yF0%>O
zEy&0LEV$J+G7(EM8$C2jYkJ+n&!$Zr&#Q{Ag?I(QFi#ksUv4Ja$}n&P|J39JV|dn%
z3qo;)Smdwrh!4xZ%$)brF1;*?JtaQ!JkQeFEOet+BVcNssIa#iW+~qh`O?zS#s47d
zSDT5b`JOZX5W)P1CIP<B>5qe38xg%&hDR-Gn<$~R{()MtQZAu9ju^5rL2+tYTA17B
z{r!DE#-v?5!0lyxT}Hg3(~{e}&{A3eCJftN)zi|Kwz-JIz<K()luJ210EO}qBLdL>
zY$ds9%hO?k>aXe&RB#__lAoxy)}#9q;=ZEJUHlqkD>JHCSOutfpVay+<m7366=7Qh
zsM2qf4L%6WH#OsiQ-4auzwi|W$L5@y=nBVd%=9KdeM;`cG(%L?Q}#f!-lzQr0?LgZ
zx4yNbyl3Ia_pB=bLa*2rL(&&?v*S}R)~^zA=T+po8yT(l-}VQ*9E7D6G$UzT<Q;Mn
z`MHuY8)>U7NQIaem$rzcd6!JT?`MuDJCV_Y&O!?onmG%`LkX#_^6zy?m<<|^tg~(6
z43kicS8e4`vNLlx6UJ}$EuCF{FBa9ZQ@wooBCZ&X!B@pqN3I};e;t}mXT9FJ>C91o
zHiPF>T8hTBT?~O(Jerp~Kx|`Q@BLMFR@5I$W_l{w*3S5hv14LCD?ekx1}vdfk1T1n
zE#ZjAE4cv{q`K=9HF9S5%E`(3@DJi(Oh!_f{RDjj5hsExY}*C+O-PGZ97z&pE7;Rf
zz>Bb&|2gw+m5jE+AaXrvcq?`%^(V)5^l76`<%900KBoR}h*VUN4B??ev&zm`GM1Kc
ziYCd<{Ips|oA%-UIDcM<_U;x5qK4<=6YF!%a|h|h9!Za$dEZifD}?bE-&<&HOar;2
zmSWWT?u@Yq_`3>XmYRt)&}(WOgGJr4&yUUmy4@u8f1ru+RVJ>-ifV6Tm2Ovv(5fr5
zt4((C=gV>K7#RHw740<i77_t>Ce@z*WlA^HMafoYwM}k%M%#o1G;3uEDyaILlL*q~
zC`N8~Q1UU2%nBYHQs??jFV6#zuxpN=Qp%3DHh&FgxRC7t@tfFJzoU5H)34FmBpo1{
zVr;(OV~rV=dB?^xXRDR#mY*~&y?z)35K=O<V&mdB7&k>vY=Fw5c6QgGqA9`W&sCtp
zK?pi!F1+L8@)`rFF`))ui|3C?%6$sSf=j&i%68z1Y=)4=!sVOMRHSf2&xph(Nx>s`
z^7R0rgqj;i93$C5ZQV`X=d#vJPY$7wk}6wN;d-G1eoD>a0(@6-in>a##GDLIZbkj<
zr5EePZ-{<|Nh|~B;#*;ZP#LG-7EsP;${j=!J+re*IvyI2$#1m2J)|si@2ovB?C)~}
zO%YkQpwG5m_Feo_z4&`JHe_55#l05ht~{8Hop@G$iOJjR%ru9Qp7&DA`)W@2edSl>
zDF;>2|6EZjb?vZ;+%?4jqLSD9>oZFmBRU*(etwI(yCV3+RhM81fiiXDFt+Gl{PR<t
zVdD#mqXnqM(R$ff=~;PHT@fin^xhM>TO+%BW}Zxhf2ynwzEG8T@g73ltZJ|kF~Emq
zb)gA7E^@j{gK^tv|7g_y@nH8x79{glqX$%5QTIM?S012KF}V}gn8TgTjZ6q>9Ph<M
zL1*bm?Ia}FoX<~v${Bdl;_Tld<0{H@JuoW(iR)HvN!HoPAk1;e9hP^|U%G1HQnRvX
zcDtCBRIP7UTO`ik3u2+h*5KNog;^JiL{-k-q!%BZaQZmUfFJhhGvgs`Fy}W=3}U)P
z)n8aj1wSOsSEi8YSuvFs#^A0}mis>J2^M<aH4VH*=4^|kQx<0nilZ8v8g4=GL_YQF
zdTjpMDFWNAwQ)7W05t&8%L0|w21RlAE~)?6#7&Mbj6qpyMLC9Fd}daMs8mTF;uutE
zhVfkelXqlFUGlBmg45%K`@rG(_N`c%x9_CgwnT3r%bUY}3-|uP#DS&nRUuvW4=?;`
zO66b$LjA&ny@)bU-$u}MOyom3tJjC$9E90qv-myPY4Op3pmHXtT7PLzdGtOUzZ8cv
zZmj$e7xZ@OzvYstaKXix`$f3LDEed)wMBozj;A8L2hclbtDQrnP~LSpHvfk-$}l;y
z`^V~rINAQz0|~~Nc^Wofh%ex>@z-X>A3wj`aNFPK>|^)Ad-`l%XOxUPaX28-sbfno
zR?bw(YRZf_PT5gz+MH8ZlRTm2M6Q2&k5K`K%RjU*qu$A+R4&4dLV}?sG>U>%w|Bu4
zG$c0j(U>fKVv;V&M1`>>)dDu--EPl;X?KS)XTff6ksP$2YNKfUtP`$syOQ!pei{en
zAR3_1jPTh`>HCHBC9+fEA!1Mg?T%G%+akYHNmNaf&3VdgAa?bh9iKZCa)JTFI!C9&
z!2?5q(6w4~^G$#6A{~-DfNbS^YRHguTY3vrYNBX}%U2r=(FhX_fyq7@W0xuDJfTdW
zLLA`zp)VM2$<?H(uDH?o$KCswxvFh_47Jj@Rxt9>^8cXit)lAK!Y<K-Kp?mi+}#q~
zVFST~YtY~l+${vR;O_1gAh^2)cM0z9E_a>tpF3{f=k9*kFEwCS?W(mtnR9-#_9NCj
z$9tWRmx?s&%FlS5fvVysXB-~QVBSHHs^gnFSwgGK-RT1Bfw=t}e#pr9geb$R$q=TJ
zbE?9@6%B(!;TCa5(hVDJNn?t}o?!4=QL_qT^`uCY{fBoAcsICfiP4Qb!<<uH--c^`
zu4!U?e8*{)pHZD8lt0d{i{&K<3EneRCOLSJs@hck=3_2FpHj2(tL?m^HJ<%guTfS-
zdHL35g{1mu_WT2ynFsotuCbL9iDlQe9}|kbPy=XaA_tmfyT?Jry>o8{hGY#BH7_Q1
zpij9e=%lYFyX;wO$QKt%caQJAMV_t1C$4!5=UAZ=GLdKux}7_Wy>5)CrD81**<XT;
z5q`#htlQikuBZ}}xjw3;@J)Wo3fx>rS`t;uj!X0oPa<43qs%m|#kVI{^){nb>>Ntm
zu%_G&U&^Rj6HAvh_<${;9DEO*WO;D#6M0SQXVpRcEg!DGtq$M8#ByqNTiXGziO17Y
z1(n0#-NPBo>u^=U+R~gUYz$0-E(KE2p6DUcF7yfHWw!uqJIQR7NL_Y>wM*MJu_fN)
z0}UdW<Egs?%fQjp+oAW;thAq6Zq&B3O<rXNK|XvcKx3!ey*fAGWHGX<rPD~tu|J>9
zb00UmTM6Jv=!|(Ep3lpLK<;%G^J}DoYs4izzL;oo`Ma4~g1e-D&C5?QG?!8N2qJcd
z_sRFi8M%bc2y=-)H=I=Yh%KJusQn=V(e4{9wyE_5^e2_vADe{wp8ZC8QXZ0*WaNuB
z8{c$=$?5B1YyU~MrBf<$!Ci52d3ZJ1_Q0hX<QkZ<?W}<odKQ#B|N2v%o<1RGH(^tn
zT3yJ%aOE<wsceGoNV*M$c7Z2u%txvzrK*ayl(Ja*$=$^=>3THZ5H@o5o&4dmCaj9%
zT5d>r<_9QhR~JU=@h8JLh%_<&@p5|FuHSw=>DinTZZXvR1c@Zgp7GeaYq#sz4cxmb
zwU0@ryOhf_ee=`8iP05%&9rNfZ1_B}NUq`^`TFM(;mCpk>T!Vzh3?jT$Z`WL%M0p_
znfuu3@ii~otjZ(`cclj#EcKt#oI1XvzpnG+e2EJcH;eN^UU8lA>RGOMVNh1i_{*rv
z=PXy2GrQbBtSS%a@F|DrZo`u224<0Aw>BQ^k;5(-T*971Pnyty@uYXb%N^UJtWyQ#
zu@!;{R|+Bb4!K$unHCK}$u7a*W_(I3c<sob%a3s76n*aGL#O*m`c@Q4{Hr~&?YREB
zRrL8>wJ=)5d##mh(`ZlQi=*ax0?*GTkQ<?k8yv`W0<#0$O|(h_{Fm;&lDT(Z?j>AO
zag=zs>8`{x24XI*_>ejHx~1qgwZh_bL)AXAsSc?G-4_j2Y?ArnDz8kGo9h1|$a_w-
zbw|f1GN;7)rRGCiP&~#o5APcks8(Y#xf97eF&d9QA~t|=;UGbJm}kgJT$F&-7Apl!
zZ==$uwTkn1&3k>d#I1WZu|vq7`*0&MvR8qPeAd*3U}V>c^x~oR^Phev3s$|CDzm?M
zvmdS6fEIyNB!KPP+$rFGD-^BSJOZRjn(EA26?4jX%^dZe1a8U@P6|IwDgr-(&DBFg
zw?ZVL79NL2_-VZ0A1wu~A=Q#af(Yz%(rUwyv1%%JZLNF`hPLFAnU7s~iR4M&BeS_~
z>*c#C#n^tc(YhrTX-Sj5k&eMF&D)H)VhHDB^hnHhJ<=$=o#wimR)^t<>|LJF6Yuar
z(15b4G|AEliU%5G`rkj3eiR<jA=`2yJCTlvVe5n|d<p6w4`bwp_SS;OiqoQ;?w6h#
zoIQLBY{%}uPza&kONLVJhjIB8^Q)+CQ{^>)LBa(Y|0v5BK1Ns?N}_JYsoza=!Q5LN
zG=v;91gmKch*_Q`ip3j$s(D3O^&$!R;{6({Um>ha^ph{CRX6<!RWUqUA$sK==UIGf
z@p!Ona_IOVCNA%6NeN6{9y70qR1u;+T$0_hq&{hap@~ZF9w&zAeE`~f9P-5MBwq!L
z?3sNe#0@i2WI-o}^>t)jxjIEUioAg$gxLjIUF)q45!{e%f$!8@s)aM>lHrkkS-Muf
zbWl97h5C4=Uy8;xQhwPo=oURtbyseV3V&M3OnYY{5#(=6w2h@^U%eU73kqF<{5=*%
z_@8dcdm3GTGSb4L#c%~LMX&j!>9^6-ij(@4F^kE;qG#t*vS-mMA5&2I22<uiiEq6#
zFa-`{dHJntrqYQI^wybQ3uCT?K>G%bs9LwF{g7JP6`P?Fb{cQ}<?~B9td6}+m+<0d
z)M)?@zgUSABX(XYB(U+m)`z21qIph0sa)_(D_t=Y=K>DX07mOkr|MidR#9%;J3KpI
zCmN~F4xU{mgG7nh;`)_>)%bN~!)`2#0iFg<kKtRwsjc@g=Ed~{eC=Xmcn!xxmk7#J
zcc!im;j$$r-|`F%<bvDMRyTH@E^ZT`W=-}+-f(8XyXL(Z+BgXt)+Xs$3s3J}?&=a+
zCv5rhfZ-N8vq2J@3@E#c4h!Ve=VGY977q`o>jnOi3gZsr<r-}FvSk)#B|RMr(&v|s
zx4i+UWq)-u*dNk(Tca=zH3IAw|9Vv2=A<Z4V4a>uWvOE8k339zkat}cY@LQ3Rm#b}
zq*=VMGncoU(LCvrH`c*j&g_PIZeUYoA0yPJdEpsYlI^Kh><4jNl{EfRQtvA;asSOo
zW#jdD)(7;)Pft(Qe;cp%Sr+NNMJlikk^cNiiTK-L`@7?%#(Bn8fy(pM3RV1ZG!qso
z<O@UyO`+o$`{%`Bz2eEpBriWg6H3#F)nTu&P;OIGf|2{m$}t_Y9zFFhpyfSvO-wUM
zPntK`xf{9QV~x>OeRD2G8go8W5?IEINLuW07pWubjKPvouszRtLC7V%F<J3dBIe}S
zb7gk1iSy;%9ZmJUgF_8N-Z%^*qL@N_bHBKV$M{qGyUEj>mvz5yEcBlCX_48yHgc16
zkB2hMk9?J>3bUIA*|&ZQ(FxoRoaXDj8xF1-I0(KyzAd!uaV>5UvOg`#BIK<F;NSB+
zZN*%~8wY4H-5kLgq8Ei|?A&+FbXake^u7Ht03nfN>k%Ay+93>f?fEhY@2kZ#y1(<$
zs%yOP&-yrj>p3#Pnt>(D+EW{vurEi_>2tOJ?aXn;GlmOcw06hBRiO-9WDcbDlb~R>
zO;kjD6+r{{!lpC!FQdgxw^oT&KR+I1*v*GF^+~MU(?65-XZR6aY7@iVdW~%dBs(Kj
zEW>&sG&Uz<G^oc-7)grqgX^au#qUQa?JN>Y>7}WiKYqVC`27;rUh_ISRKuiOX@XPK
zVIx&4bvJjRv?9Nj?)e3IX?R80<J-#FgydN9lR3N{f-fJh^X<z%c^tb;SI3K&A081A
z5h^MwK0dG25*Iyv37{TP@dSmRx|cAOm3bMK_{JJ055FH1BK;1m;P<#Tt8RNUOGP5K
zJ);v=5oOl=zD|e+uY+@1J8dE)e#uHqr|k{Ss-Ek09|}^P;4_KaRZ`PXz{6V}N2;ix
zn+#ERH{^$k6>oz|bu28bt2a0Tgl8lH0~%*_NJz^~)`Ter{O*6%nFHf;@XKl@4F;~m
z@4r+-@qG&uZELj(<>+DSA_=I4SYhp`8ydwVaf^ojiGrKdn#Rnl{ajwa-ulLwo5i+g
zKy&5B5r5s7Uc|Dt;uXyC-}X-q?HLaz-362Vh61|nTNh<l?99w*r4`r5oP`Tz?ZK`q
zNB62#wP#+6n-O8cSlFslyM=TuEjvGYN;0VPq~Cr7J0wt)>~lKb-PTY704jBxT`H@p
zsti%P@+^nm2)Lq(s4&tOly>%>UO67OC5h#hjcqciiR51SJ$5S;b`E~wh?B`5YCXR9
z(!+}im<kym8`G<vx3jZDd@E7~PxozX&w(tef8#79BxH5U-=;qEN9DNb@91cBOH0e!
zOMrvpFzYP+Hp4i4(8#-wRVe()<5bXa(C6axWbxC>*9ScsK?)D~m36*Cp=FTM9M$Y3
z9!z=i02!Jjyqzw4;<+V2*^7&dR|DyAJ?P3LdIx4AQUibBVF&iCv&^MV;BBupF?Sw|
z>S*%x^h}(lHPu;rTLgOCiMA4_i&04Pg#~q($dXU^T{GGoMQ&&G*QM`_X1tsaF{`So
zf*F+i`}?e{EI=0u+O9);uo(@;u6%z$VnSjy8JU`#92y+_(&_<*92yc5)FO!Z6%`eQ
zR%gmIISqP$1_cEHolsY=N!<Pr%rWCaMthCPmz?$|aSyFk7FJe(=zqW5RZvlx4wwvT
zj%jRc1j6gLu&^S|M;29%x2InP1qDBUR=^1vFcdyvq@WPheu`18^t(O9IlpU#BS1C*
ziV2|67846gF8dUUZDwZHu~O&UKoG}f^)ybua!<&WJAJM~|ItH_Y-NDnd9!XJ*dt0+
zniYW{qK}@D(O=rPyxdA&e(bl|Boq{sYN`5b*;=oc?_ORnKqD*s8|aQowVUFHjaM4%
zVHZ5+Mn*;gY8Mt3?!og~^#I!n+$A|_Y2Ryta)b;`I!vz3P$6%NS8zyC60r>BuLLFL
zezcv0A_}?_5~l6$n&RW9F!v)F=I7>$Dk#{FWkSXXges6r(S-HL1q7BSCeD@6HR^3h
zGe%c{r2K*GM}RPfJy5>`_KpKFA~Y0CgG~Kxaz5C}F4+O0gm%RZlWro}tTpd<>JbCd
zCuuYOFgbTr!j+JO3E?k{!wfdKxVU47_6<wFe*FS!5#3e~4Nc7lP<D+O6Y;sA8X^mm
zEX-9JsFi(z2|J2s)I>x?1VkW=_Ay`!>zkUSx2*%WF&EbU{E?YM>pr%Z_J<1eK%=p-
zj$~l<I-L9H?*0I#R5~lhf%smnLT8McJZ9kLWF=6S-Cy{h@f=a#+pVpwX#b;*NU-Pu
zIm7OHZs_UxR9IM8Z89pXTLy>$9UUFi>deen-=_gf88D(DD?4(!)<vmv`*6}GB6xCt
zbx>Yj4i68XX$yAJ_V%_7$!VvbpC1t8Kiu9*RH<occ|6|Qnv!JRC9;`ChlIcr7@e*6
zL~=@?B3l|7LTm8YtiEzUA`R>cK#GZq`ugELARNH74F2Znc#b%@mx1NE6c8FhHl9f3
z7`I3sZ*q?J=${_vLeX+j!U?`bQ|)ONv2#m39iPbj8Yn1>G5+%BdA$sObYW2c`h_;b
zUAWag*E~k3^Fgkk%2!g*BcSdXqw0=0%O}1=*HpsS$H#ai^&Jwq!{02N7V8xO9GnCo
zzwRt}ettF?N!4g^V|Q4{T5L8RP6qeyXr*msVBlS2$NBmBSb0D|fbc`J{f_M1**`b_
z=KC^{*F9|$+fvC@#<VUX{1U#<IF)QFT$|=Ewq4_=vp(Wg^HYPj{5~rb4sE>=#3+R9
zksoedyGCoWvpc)HH^G3Eo^pD6I+ze#wHd=lc=ryM)%a&=DZ^tk4ud)>0Rf^d0U=@5
zJFrw<p&+23c->#pe(Qn&oxl^&V){@W|7}W;%JcEY#lr(n5-T>mZv#S-%Wb#WUtCOQ
zBOom;t*g6YzcbPyJ_h55IGVyKcdmaMNh%T~sy~{}PpV<6B%99nj?4wH&y9^dnwpv_
zDk}Z2Fo{`E8hrX~Ole5CvJF4ziR4PJ22p1Z5q=ut9rUA~T9PeG+RQf2yBZmAS-nL4
z)~dU2oo8C~<ffBdpizUPqcSuhfsv|=sI1c#tsy_`nX0m5@=5T#nQZQJ%h~TgokM5S
zHxr2+<;C0CpJM#P$?2812A&CZM<e1iM!3(LqX)2{7ihTg2gHq8L*F<5kd^XN{mcrN
z3eA);x@MEryJ=*YYysKfptj*9jzc;rE~OLa##X^i1C38!QPJ(l$Lm}kWb3ZK(M4AK
zc<|9U+!C`f#a8(rrVIxr-3#dln$syMYJM|S1|1od+w8jbAfl8}Av<$bt*b!(l381_
zH=0WMm)%GcA{l9iQ5+0nV8ADeXO~C_chZQfSXj=dNKUiCR|d|{%GEuroDsLaxdBNE
zL{p%N6>-LKYa4rH`tlo&XLGE!ue)1lMF5a*$)BB_Z@^Qfq|5_>^!oTOKF@>$kLO@4
z?N9=9pfR!A2q5-+<6o+!z?u?RJsd7tbxWSy$zW8+%`D404JQs2qCtg&v!?37Ob;(|
zOI9i<k-*)_EvAy=$I&z^hEz|-S@)P+;rK$`1XwQCLWrfS=4f6mzrowt+EUTfw6L^9
zYM~FlIRvvbjk*H^1Bh?mQ0(=F;YY79)}wjgp`yOZzZzn0Yv-ts5{=e3Flcf-a5`D$
z-yMb2`}%eP8640sD66Y;nT!zZPM2vlprgZf0l}Ov5z!%B*{g1?D#HOl{icU_=Q3&4
z3!$!rys-t+;k=}zdFK^R*uUbZ(mpOmM*V!<en+vfm_UyOi$b)!w+HzR)_)foDM2($
z(i>X8BOok(0dm$cWA>z5oe=2oM2%h|#}5w+YzBD#OJ<G}o$1NGO*CfogU$R1<2ip8
z9P2{l#8uN%zT%mZyQv&Ncnr=fQ=_iw;v63ri)z&Jaq$a-@_V1ogay<{Zs*k&?d{E%
z<OuN{u7cPZ%Kz?9WjwM4E|w^NbhWTb=)AIvPw36>xIZ=finQn@ZtRTFSdQ+fCOTjF
zPE4$L>g!jHgnYcv9VyEiwv_fKoZqzG<YHUXZ;6M(wDeSzhmV_ij1E3LJriQ>WlKgU
zr$%<3LieuJg<AJ0Q!<?xQtm6D%Js#L@}~$E1d4ql<*og3<r1D`3U35u)-SlnKfLG7
zG1yX9K>ecskdH1(Z72(~_}y}_eC-}mD@Ed3ViiyDUA^_Y-*R(04=Q7g-(P!+w+^!X
z6lA7GQ^KJWU!JEh8X|UT%PZ<XzGStdb(f%Z8bRH!jlua=Lg4yS4WmTx!rpqK4Cn{o
z;o-?pDS-&jf`{075LhJDTRFoFG$5JnEfHB?H84_Dp31PCKE=xI<`<JlyV|2PwXRWq
zGW?L4Q)N_WT0CcoBJIRam$6%lm!_$vN?nio&4aA*{BWq4d?QR@b%Bi-t==C2$JoR~
zx|g6*onfr>>R{Ga+55Y9OkQ4|WG`-RL4imh;-jRd=HlGk+|(52;Sq>p&}YBi-UbP0
z6_%8UM7(-tq@^AB^XF$o1Z>*i=x73i_u<|i_t#G7-O3JyOzs~JgF{240&MW$8{^~S
zM>7Ni4N0jecx_hOeSDy}L=6oo`pGEZqMbn!>c>jd0D{kmJ_G~=@*v`1AFAiOos8&p
ziMQ<{N>DKKb=Dav_>YP2^ui)rK<4SXj-i$dKqgF1PNo}B;h&(ek!IeBd6#SCS%aNM
z59h8}*{^1h9kUSH%;V^K9;&7Ko!vPoO;PPlqdZBfJ%^lll-<42bcN?{*tx<^IetC0
z<Ou&xVQU-aj^m*0^lf<TWII0Zc6K($oe%s<j!EB}SVUm=v_BUr#FKxvQ^|=864dS&
zylvix)g2ySDlRZ6o*bITH6nO}wQ%n}MiUbsxYO2O#EbupRL3=2CSB^XRvGM&u34>*
zbDxh6q#rW8p=Pxr#g0c07Zq$>)V9p}?3t;QUydsX*NeaN6<2A(HiVu{*L+)p#!+E)
z#ATA>y0&pb!MI<klP;IzL_<g&(>GebCuw=jfp;3JX<uzqRg%n;HfVV9=UGPTcA)rY
z2fibT`wiS1i85b0qktwc+!(<Jhwm<mNCZa7Dfbm>p}TB~F8Ws_yju-A*)ToS1S9cT
zsLo-i?Cf9V5RR@k8}VrBtt1{x+QRGn6)jp8p6Z~eUpEeddT|j{ADr}9@Jmf3t^GzL
zd`S;_@!Ux=d;}T-zeA-M#ch~s7i4^$7=yE@#P64tax#-SPaK{%4i`@N*sxtJe)1%4
zSTVHC6^Q7lgt?o3xztC1!J9WQFrPE^9pRa9#z2NSMiP8H6;V}<&z+@2t4G4;0HQls
z<svX?MZ#l^KwvsksE{*~&W|AI0uteFjN{&f(cSqL7!lU9ra*spZ$5?<8XkUqcJ`V#
zR)!Gxe2{_4X|wKPOmx+1rIp`wEOT&hkY26)Pb#<l{pH^A@i8cbPggro`<9mYxVYZ`
zIRTPk(jb_TfdMlx0vVl4@+Q+WG7pRh2}?u6`U|0okFgIB)`L<+*yr85cb~-;q@vp2
zxm(G;%*#!VO3_|go#}E)siBM!&U`YvA84NAyB5XSD(30%Ho;QMPx@jWJd;C#K5(2x
zo2d(_Ev_Lb@9X{Ts)L8+_-IFE?bl*%az=RDL_000OXtQlOHiIrYZ2|PNpo4H$c3g&
zDNad*YhAx*HNUS(X#wTvrb)L?rvN89n99n18Wwue9tC^!*`7UqRyC6<?w1gfyJm3y
zJnwDk<x1lIMMp2Awi@v(m~z)u1b<^Mish$e8S&sMzGl<WjSx$bt3U?@C#{%L265cA
z%vjY<?z<|wYglQXFoyK7*uo~Z=j$7)nT5;fFX2bUxxNNS8O9$QgoT4bi(7ZUe>hWD
zo-Y^2sx$tQA2+j5DvNBZ<j9(D-Ke%d2l+mP+LFP9eGGP`QDSQbR+_B;TghH8@#u`2
zPLFYbX!HqgB-~6h7Or6{+r!S&apMQ4r<T|2B&v2I<6BOAp})|CBlx{I_Ge=6(pr@c
zv)&o9<k7bI$x3^UhMXwFq>k$DnV}j!zui>?p6!<v)p?(e^1TNA^KMDatw%>0X76ZA
zO|OHJx9n|*MGr<}k=WL3QGX;SGTPtx6gRR(nXqdSX6>RBf`67?COfDtU#pM>9l<u6
zfLfPC2k~uJXQw{UbY_0`>y42AEF&W{^3T|y=fgFa7RP)49vT{2$Z+7pIkjB61d)Z$
zk*mAAS4ewYT-@HlfiK$qL+JNxpx$u@K~7W@TC}{ZjQ{&17j!2Ygl3HyZYh>liNx?^
zxdr;{Vg?M)f;rp{r|;ie0|Q}ADR(o&!iITatZt6~f=Q7U#{<<EK8dh)mD#f<Ybk#V
z9UM<L1@@+g?ccGwrE5Ql?LQ2U3w>{`+r^u4Rtu@{Ul+DMyFeygF%rT14W;kB!Cm%Q
zAur|)FFp43xooepza0BsVa|s=2{9AG27iHtp6PF+jAl2+DQ2|&n5=W*7K&usn*EXQ
zj`7!M(w6z)$Vd5-K2xDlKym7OxfxphH5JFIkF{#hZKRTS&h>S(`)Dt(uDXPx%Z`2_
z_!cZOn!<G+O9{^_-@Fh4c8)eD8+Fw|=Iqyp+Oxf7aAQZalD{X)2Z$>;P$mwnbD$Ts
z1lqc+gPpsf)CToZ$|d%#gkhhg=!|uLmV`fyOG(D>qa`=1Pi@KjVhlKIx{X-d97n~3
z;4<MW(!Er9s>pLvFwpOrG;!kCDV`h^dGbm@htC-~^7%{Q=@(F7!p$zxw(v}MPphx#
zxDBUH-YFpP*3Jw*Vnf{=LA>3mV<NT^i^t<{_s(Fn62_(uCO;1)J$k<w3NuT5*11-8
zm|x(fM6+jP?p1mAEV*4~P?&V!$zPu>`P7OHUkOvnbYFD<yC16i#NZm=wKVJ2MEP%%
zkuK;ZU_MotnD~C+w>c;sONxt=L3apxw7PHe=Qk@}zK-k!-6Tn3n_ygFW@e_Nqa#xT
zG=ZaogLqsvt3O`9K}14AA>@I^2l2`;77UpRjl{;rLPJ5oaAZDE8hhUJ=5v4~JU%gD
z=j<HFWpf#d!+gA0SK;;Y3~D#6dYiRBNvy!!BlqnZh*cyeV4g^;)#DB{;FP|SEzQlq
zJY!W|FB>%FBsO|Us7OvXS6BXU^4O?cw>Y(=Z`kmTj?hjN$!#bR%PmatwBi|lZ&nSe
z3oK43C8(nX$Jz$61UM)VnbJ4vjPRy)ts~T;U{qAoV@@Fu`1Uv=f1VeZY9G?V&A$8g
zIeE2|X9hA2ilrh+vG^}`{wjx=Ex2;qq1~Ya+m4zUZZ6+oKYt3D-M;*tH)CBjCb0_T
zTmXVz>QNEW?6{Gd-3I(z0TsSUY*lkzo(a>z+(gaQYx&h=7=0;Q?TBHsQ|%!|+R-Xp
z@*9%^Mwk^c?y^upwi)@y)Yv6waVa^3dz98}@l>tWOr6ybi@BMSXagJ#DXZo^CQnz}
z;-Vs+X=6vJUPXF5|5SBl&TILR3m6-+vq5v$crU7a=1N({t)FulX^q|6!x6th;r2(Z
z+Zw<4#H0-BUARDEIhFV_%M=Y+j``Bpxt`jZ=W<YCMYY2mkATFVxRcj|b4mfFraDCI
zPV+D$!|IB+CG)LkKcy#P=S!NKM&+~9Sr>P9{(!N{d2dH!YkV<*0POwr!Q1_NjS@?D
zbq8vHTm5Xa11qxDSD5vjr*|Im89=Ccdw18-21K5j;sCh<CK{ws5}@BiLP05zr&%?W
zm6i4D1=fD*r9f%!aI*Y&agmpu{c3kS2Lz_mlT3&&-1@^;SY(1kPHUomj7?GohCj>8
zIxIMn80Hoh&PNL{y-xd6w5Xz|j3zs?2;KnxxC5XRHa0d$v;emO<1`-#a2!w-rKO~x
z>fCqO$Uak@!&Js#dXuE@S^)v0=l!KwG;9%Cy%ZHnc|sr6d#9N!PR#mB*h%-qi2~Q4
z@~G^Cq?+}A;{R6>Z(pw`=Uk*;3d!??OpZRs_)6RE7V^P<mS{$5`NH%<48_~G--dz`
zE%GU(8B2|M%cWgeMukZ*U91OOBHE+kbw)<%b_m{EATEZEnCz(7Mf~Ip9w;Gn=Gxg>
zmrK5`@CUdOc2ENrLtWV3ncx??K}CG{)UISdBwq&#ZJ4gD%EiF<mj?`ZiAzGOq7f&<
zY7~d>jmL)IJ){d8D6Yo|s)x?lBdQa{`%;7?6hD@nj9fY>_-xku;N5)QQ}J?iOCKh1
zCE4+W;@sB@h+?r+A$j~0?0cP`XfZy6xNE~+c|JSjFSo5(2O<u<_XSZ@%ALIK&P4!@
z2mY~G^X|L}BD@v`6~V|}DS7prTHP};0o+oe?>whTGF`vCTwAHSeF>L-`SieU$XHVk
z%C-<a-3P67m#-m`IG#MI?b@>x%%|gc2ZgQ2P7q-<lTnGB88=n0k2E$&a7ajDKYzw9
zWky@I!z3_iyMb<zC247sya<i!wyK4*fbned*eEYn=gUs*$Ffyt3-n-~I|*Z%uYWhK
z%#-3$t1GA1CO^we8+~G>M}T(nDtKH_b2r5q-|Tf(G;ifi;_QSpG{<|fCfT~@&)<^I
zcO9uvH3VS~yz;KriDR3SZD^u>TCIkEW3=yGDON=3NR%=W`n-Edg|JE6#{yY(Lc-A|
zzbKjWbT)QLrro7ea28-#<X3dKFj(}BUZnl$MP1X|1Z52#9+8X3xR77dN(x@_0RKqo
zyM+{$OV~oJ0Y)}&S@MYAYOd*;?$lwpHDkizba?Ma*3PEC%et_P$~-<(p3Q}x=OrMt
zUC2i;8h+ZEhOku&Mk9wEts0YkDaC*uar?9mr8~^zRt37tdrdHrH@EgxY&!DSVds|9
z71U1U#|AHxo(xguH)k;+b1lxfpy3iDLv!MzYvP>zBPISEc;Sl3;dSs!)xGFJ*t_Y9
zW}<F(mpe4y%&hS2uI%6w4P6|1Myl=U*uWHy+Ml0@%Q$6E?+9$4()78cox?wrdcl^*
zlD<cezcVf8zD$h}kE>|bxwNaix=3Bf`Z_OoJppH{^024BR%Y1CMLoT%rTPu>H%-GB
zo+Y+2kR^?$e-th3ubH;)lbSEo><<tMbiJM&!TvKhFLPDaV|`U@Zf*{Wl-p}@5XMi#
zn43~jyb@q{RahDx(&I*tvmj&&RD{F7XJaU1g7RvbNu363-70pX{U;4ZH_nOj&!h2)
zer?2*E`(MTn<;m4iK`f}FU$lQbsfglbYq}>RB!e&C?-8sr4xyfP2bHerVyt!u|wf2
zNj=CrGY;ZCQ}n>n3VCQv+B>@9v*G)0Ee*qL|LH?>6zs$k0~S@&_lZF>Mre2~dkCH*
z$>ivluhG>!bC1Eh#SJ>iW6({3R1DYSQVhly&_1?*r+!Z9jB@qkz|E{MrR`?t-x(9t
zTLi1vw9*blf~OmkgU0!nIb)}YH2pb5esDECq9h=^J54#ilE*P56fa41uuLx_ybN8i
z#>?|B&uQtBmQwHETX0opFn@B!RZs=GWe0Q|CVh4rNzJBTrKN9AaKN)u3CvsUA__z>
zCx0xxf{;Z`sl*ZME|uimE&JmxX0^BU8;tNIIeXRO1`91+)jMc$s!)l$@Z{V1#NdTV
z$<pWxNSFi!1)O3;w9?4YtTUXCE60)&dn(|@s1=k85lyS~2z?6Z*}7SZu64!?7q(pV
zn9#o#{`fgxEJnbEZ}}nKHIB+V=Fb~v(m4qc?`z{3_w&eKmuuI9<c;P*FP2r^oq;<=
z)-03!MQ0A8SKcS7lrKln?;=~!9#|`6I90k_S6gE_C~dAT+BJekV`ixh3)<FYW!S35
zRSGkp%R88eb%L{8<s{ZZwTw^CnsE@m%&ILVGGlW6P#F3)tY(cKC7O`%B;<EZZ9^8|
zXB2jlk8W5pp*N4N=u3%>_ZA&55QD<x>a1`OtN~eBws;<Q^2~P9&rxLF<Pxg-HXkB6
zXA3h9!d>=_{A=Q6=y$`=GE8Q%r%x%E`D*k*?v6=|5WTU*r{Oi5?OGS&HA-`?=sRUZ
zTb9w6rxlem(+Y;bn`S`W6lXG3r!KZtpW|f2nCPk|nNJ+rPzG=y!!J>jYD0;e{<<{m
zLKQx^{aBZt>H$*Z_SO+b^pv^Iu-Owe1-#Y+di^#RwdRY)QxCYhFz$=n>DV&rJc-rF
z3TPN)Gycr>8cuH!fmiSZ6N#*HI-lo#paQ3Sp#pys$*;<<M!JDd??3+tOf$cr(Adz}
zNbrM?!GDL#U@^HcxpIlX2SnU|56`>-V~PZdDBug<|9AM{H4HKgauF;zAL4(9uYN!U
z{?Ccn-u`#kO~_lsTVx-c2oIe9-^1YQvf4?n{yYCahyQO~7M#}UjFR(nqPa!x!wB3J
zl(UO$odQ3D{Aw<tqOD5J-lauN>I}UN6ZZa%+I=t#ve(80(zJ!7i>Mtg{L(`t+){VC
zSX0|UI284fe5Vy9jJ3NrRABS=W=_He2hoeI&FJ^PJZ3b5lSV}>%2m;+TAt9s?I3&z
z0V_2Gc4W1&2@SDlk$wvOcfKu8Qf`if2j(T1^(6|lVtALho}an2NbghFL@4^@B^S%{
zO%kTAkM+Z$Ze)eP`)Z_{5(|GGFK2~^-Q?!s;>BHls613O%tB8KfeH+g=%-DtIx_T-
z{RpgC?$J+S+{~pu;%^Vi(?-KlPS+5clx>~EXthR$UC$uD3d|W&q_Z%uU_v;aoUY2N
zo(#%tyVebr%u=AK`9}2Inny$yC6th5<HD!Qdl4!q_igXcLrFkk^$^D|A3Ij$U|7`M
z0#+y?K#Hn!n%i~~3X@9|I&esCgjtf2fw8=}tjIvAa8u*iiUM8o+{}!<>#EO^fT=<N
z+6+bc!@lgx2V|yH#<VE0pUXCzPT$--PoT_Del_qh<4+n~h}SuYwbEwL*yu<*<zmMw
z&~C<$FrSB^!a>`yOo@AW7L-d^?=?RLX124V$cK%^qDqY6iY=gdc*{e;<<ylUyonj!
zLV$K*s-@2)rH+v}XPJs0>F3<SfC=x=NBAU{5_Rn(@1dX?8$&8}H5(2UNCM~cTN3;>
zR8cyxAW@+q3lv~|yMwg~L4`Jp0W0zd7n2Jbf|~moY?$9B@~hx)B01iH<(>Y&`47kV
zXt=nNWKoTPESbP{MM@X~W$n%Bsl1Ac<zDme_rTCf9tE(8jA>}lkXC50$M`|=`_l|&
zG0DjE!O&iJue3rqKBugtq#0=3Kd--Bj0zI+Iyu<eTf7GdJ^0$Ky}fzh;<G)xjPMb(
zU7tRELPkagb3JyBj+T2o4_rk>MW7Ye1`X5$*&*PT#>U1L7awvY&s>^;mlh%(TYrDQ
zDWGF)p1WEZwFUmy%kcB}F9T4#fIzDx*A(lSc1b8N4-YaS&t*ZzcXM<rGqbxsVy~a?
zKW4jD*8z$uQsS44k2;1Vg`%|D9JMWgoW6Die0=}&=97)B?Q3^UFtjcnvwm?0*xkNx
zLf}njwx>@Dcdo)VtA7ne>McbzIBv?QO#*#~2qJ0{V41BKYmb045vc78%gWL|d=&<$
z9qA+J16f#DUcY|*OIum_H|P&1CME!j2e>Bd_8+eSq#t<t^C#k5Xx|1Bv4F?<=764~
zFDy#Hsh=M-Dl%2fJm8vjTHGKKfNTa(m{>T0>-FJ0u+{@|w9CW!3r`Iw1zv!&f%Aj5
zay8Q?2=t%7)EOeVQAG&}2_>|or4dH_5OA0hQ&L7#xmo%731u*~s3|DAr>1^O`T*X$
z(dCS+e+4uqLPDg_=e2;|1T&Yyim0ZRmW;u!SNkeljmgOiZ7<IVfr+dpvPMRfn*`^8
zQ5OX$pycnVsVTsL3X@2v1C881io^5c9T<({<>D$VE(VP<JMfZ3M;3Q|e&mK0X7mn&
zz#YQnJsj1oEUUc>4h}w@1}2k*n*D14;^yb)XVz|n&O2Ud6I4cs#S!4K+mw)$8~{Mg
zs-|y<+xg}@WrlhK=?qmkpJrb<Os)z_UPXk#Ld^3AvMJkopny3^6sXTb@FU514X?#&
zzux_>{_)}d-sh&-?FwFohK^23K>;ln^63){!)wIe`T2RUl)AdenV67eOox-%SG->M
zyAQ{60#j1(3q=eKHv!xYp=v*BSMCibM4*DjgO0`|Am9V?<J43F5fKr2Y#S1ZQ_#@^
zl&HF{&dVg;cr+ae4bVog=1>DZ5|GXFb31^lqcTua7i4CV$n(gHNMMK-0}IW_uNW9H
zKx44KZ{CksXyR#}sY|a~3b<hOMBGhS78D{4Rn@m!{@SZ2;G&SQX@U_o{#KiyxKR70
z3V7lfqDj3T7b?*CN38MV-di7t&&S%^Uv=Y<RcO{BU&G)7k2rc|<IkUwf-u#YWAQp6
zu0W~|4+~QP)8b1eWORRf`y;YNZgn;EM<WcopGa3LfIix6Hv*cP9>C~y7N(7SKT0f~
z7ScXyD;yU=u2{h&V8kJP9no1KU+(`1qvrk-4zCe}l@JDlDHI9i#x}{xyh-FJE4h}U
zh@2iwI4>|zpzq`$-WJB;guHKOpeAxEDn|7_0MKJ3L-1sMoyG0a6qW0jH?Xz4O^J;a
zr9cM=O?x)5ZGj4BH|RwigZqH)bE~Q~Tds{lz=it$J>32)shgb)FJTuKms`Jg80`i^
zi4KmAYq~&a0e3QyhY<7^uyD&GD57uKqkgLW@B$G9kB|@v{mlZbyLWQCH=r!wxf{Me
z@jxDBbtEJt0Mb`m>ttp|8-%kf`&3p1*j)>Y1Aqqk3Ziver$3!`p@0@&UeP88m`_IF
zvgWUi;{e~wgBiwCQh0x;k*OwU>qriXu)Ym%2I1Eoc%-Oy^}x?8C?ho$u>mTNm4hPy
z%ICyq4su&%G$cd-mVv#!{n6oJrrVkg>M^A;0M3cA>p{RdyYu)U;iJAyKtPb0o+S*%
zr{99M{^l6)fPkGY#2(WKdh>k05h3^tWG%mt_7UUXiCgcnB0wVZf$2Us(?V>bnsEuE
zV5gM-eEV7uG+C0w9IjHl?l9i;Uyj3vwf|p)^6lgM{`kJu!rWX)90bPA<6|9W8{k6}
zosh87<Xk}^a0|;d*z9`okD0{D2?`7KJ7)(6twsl04`?5ly&uCrd~CP~<V)t})aPDB
zNqzo2JvHUm52OxyvL~4J;G(@@u(7ZtkFyDZ=f}hK5eP2kG&ZY2nex#%{q5gK0yCw4
zY3t}%SzCLpb;1PEN;2~Ew*W`CQ#Cm`lxHx2`RX+cIrO5hjePrkU;%o&QAtS&1pX^8
zBO_8cjBrtD>7Ok-$Of3&gn}~pM<x-?r}eh|j}U4Whs@{Cq*b{;^t|PTxR$^Z`-)@3
zUidmv7*(Kf4iFh0>Z?K{*vvYJ;nJvr@&SClc*2iv-fuFFXUg#5N^;{#pr(2O>#yey
zc!}tbl#HInq;M0laya!sgY+?Ny6bHUntJn#mk_i+xdVc<i$?eR$u3Y<nrVWh&q`tg
zL`Fa@KpwQB{Km)|m6Nm1&D$4=o4X+bJ{8czU;(V=0CQvCx8AK&)6-D{h8!tKwXrEF
z2%WKsi8N@mKwnX5(AO%r+CD1OPhoDi+Wz`i9*7Q9P$pJZOs+QXirIi*VSQbHj)|WB
zjdv69lWb`>GBj+rh4d_AbIG>@kaB9JB<2-2EP|gedN&m8o}rOZQAvsKcZf_OlCG|<
zfq}u??XPz4I>_ra8?dSHKk7p#mKff`6k>&mDJd&oUSA*2SMOA`J>QQoZz(||(-E+#
zK_IW!g)B?88WdT4C@3iWBq*}mNhRnvtD&u2#A`&nsT>);dvZkyWk$pOMW*VD1BHnt
z3TD0Wb`DgX)6tg?Z*7<d$d!R`570XQ<SY{SOs-Wq=t^pDIgr=|KYXz1k0b?@P;`nq
zpxgni_*M?!ft9tj&zRmrBO`A;U%!s#6HXKG<hh;M=!<BRSxuConi~b;8cI8*7dWhQ
z@T)Ewpz4f<;(<)aw{u~B-g3D)X>1j(I4g@(Vl7J~kb*gbQMZ+#7?}-|0J-XZcbr@@
zetR^dA}i}P*6zl}#{NE?>)0blBplodurbi%Wnvl`A6HN$rLlpV$OpC50xONp@J~ZL
zv|rHm`@R(3e8A?W7%&1?zVD@^rGc3Nq&YP;wOtKFH592jy$&dwAfS2@COJgM$9D(H
zp*h=s_${!NPIh)+i=9tbA#=|E5E@2eVq&Ltz)Z(RKC%Fq=0M0!{!D;E0n$lAd)fo?
zCkThFE@#<mByWGwa1|Pc^Kc!vd%uB9=`{fn7MT|b=T#Ae?KF3FRZpIn2HBK22)WyD
zisX!n70Z{|2L@;a+Mp{_A!tHK?Ck6s4A>GAVEsK@9l&yMtN--&e%)^cT&_62%?Y8l
z-MpRPF#hxYt?wCN`#@qVl!Ma+6Vw!NWC*Q}K;t&XBF||f)sNLpEbvJ9`2!dI6yp@B
z3u!Fjx$Dibww&DPJp}RW><qZX1l%hqD*D{=@bH{cf*LTKB)uJ_RVvt*3~r(r^jH7h
zOPQ`<kNI!c*GG$ih`3P6FdS6m<dbb)UO_Pl33Vo;M3K3mH1aJ;N=lLhsAkV=O5iGp
z`h2=3a*946+6oSfhZDTs+W%+U!Uu~&O~2B3%^CyO2dW1UTeY;bmXj!LFi#`;`z0dZ
zgA@xiTeM2~VADaCfdOS+NeKd8Zh1KqE=p8Xlr1Q=fc6fp`)1MUaPD=|>nyWGlFcLF
zCt}EgdnwI=@&VjZq1xo`X1gtT0tpse_)X&VKR_X*qm$mZA-U}CTZPfhxs&-ro~A!5
zH-zFxp|=@#XoJUHm0=auE00Sc_L9$#$0w<PKI=pW#a_TdBp<cs&JEOkNns(}#9K+o
zNXHLmt@>9xUV3^Y>g;nAvB^~FaQ`?k3J3^9$9DW$W9~NlpGf~-&A<1$d4GF2<;wn|
zlT*52Qbbs|{mT~~i<NLtBm+YJo;#)^>wpmhf=c@a1gFu=c1aYVZq(4wFg!mPpj%7H
zlcxbfDkKxz7^`|MD+dR*h`tgWt~m~N_PA!+Ad;Y1&^Z1d?I-^4b}Z07kv&+yTs>HV
z3rR%K5{f=U(J6Lq`$Y6l(kQbOfhLjzEW7{ur*>wXj2BPd-Xh=Lu#<c5r{4}vt&x7&
zQf-zACG$&1UHoFaiFFQ63va>J_aTu;5ygX%tv|E!Fk>9*6P{fZK2dT?^y1JLagh->
z818rVwqrAw-g!X?W@d0+w#ka`ndi*kfi?+^ACX;r^<SeV0Ss|3z4((e)M^cXEfT*D
z&p`KDYv?;D@0-Ij7f9mGWJJ*Nf9OS+Cb7wJ4|ne5waI{nC-5PGo1nS_c9%R<<SYeK
ze?=Gj6i-$srylY0iyGH|F;Uqe5ou6isVz(VVJ<3*DNsm7WIs5`*ixH5_KX%qMc;8>
zT#{z|Nzd>yl2hIMhd13ZKTet0BdY2NqEG9)Wm$e;k+)+}bn>n69kKVkR`BR&#xk<#
zAY*g*mNLYt-Kyx8eVF6D=GRrq=4d%q)}my-T?ZC_vjUTlMP1@m&OhY*stV6x5Wn~^
z9Yq00fuly`mYy%0khC^_=4Pkkpu_DidRlip_DT0Chy8`D?((ac0<JF%Oy#W7O*I<{
zn~U|)SGTq5)QMsqWQt?`(yb{-^>fbQVDdrm-JOceYwt0L|MHOVto-MwQzCB+c$kEB
zyLEVa4G-`1DguHYiO*S-vcFZ~|G(SB$#&RVBpxg~^A?kZmW-SDkB<<kt0w*<@3@*a
zgW^}Y=<Re@Ep2V_Ohq^k0dE<ilW(oghA8k!j}+B&j!(E;3tTuQz7gnfhKg$;whK~r
z*2G(nC8e3e35D=1zi>i_Fj%+`#IF-~EK}5#+4tMab`Q~^P=3QAf39U8gTvq0GtGpU
zlGrF9_`-=!M0%BHgi6%bj3Hz!i>oC)?_BM!bn4n1)JJ}uOQVxc+c9e58GGl%hY`c=
zvS?()?k_O&mYPiI-M1i-arqow;NxaqdgXn;<iY&?tfzp1Bk<J>)hdR>Chc*2Q{W8=
zdN5zB-gQzCe?)XW0|ys9O+c#8VQ0G2P>(Z4h9MTp-Hx%2Mhy+?D_zsvA`kQIh@7nL
zMJTlnLpd(Hlg=R-A0Z(U^sEs_Wvp7ni}(5_OA|ZV762am7gg|J^~QI;9mz5&q97X|
zmGUZXax^!awpPnYlo_e|9^Q82D8p2~-*}CvOQTM_Ia8Mdu~Ip@#6VfEmYzkrNnDxa
zU@7?Bw~erB5K=WIgx|$<Z$7ET+j@blh31PJ1ttssY%C2XD`*c>acQcX7rm#YueWYI
zrC@A}@!pneIV4$F;WRL?4{5?P6aRxxB9bKmJ+iER(BqQ{vy*NK7g2SX`Q7mgF@|Fv
zzNBkQoz8?}{0hgPE{HBUM;pUtch|c4c9LG3cdZzXt`W+deV8!W9^KIJ)$g@a7z?X5
zh6;Vz^WED~n$a5-e<6|&x_8!Mwp*fg;ceG@cra!}O}ahn0*U<XJrefLq7zeH&P6P-
z^XX}PG7jaG+sE}!AwpCF8f^XZ5*F>{7C0Gsm(G=QEd62RnBlxG50ZAze#eA2VS7pP
z%4^sQ606v0URBrTu_-mr0YWB7Fbx|Ddu>{7#+kxiwR)N1*&fmvZe;+V`WL`C{|s|%
zYe)U&s{amBr5xpswBY{q%^{#9Nw#~bo$+Sfn6Rm$AnQ^)Lbi%Tlwy6L@Q--W9Y4d>
z$?!38oYH>l9aL!7iohR(M)AG~?7{s}uNVZn_#5Xh3kCF$ORlPZ`-@Mc5EnN)>t9T8
z4QbJ{?&9@7nCm%mteNROgWI_iO;HZ?&nJ6K?&;und^`z}MCBMo!UgB-1xLQ<GR{qZ
zahQ!1TDqX1{aE1OCYN&5)g%oaE=IL--^$#N-A!D7bKg*ixH9sV(-^v$(@Qw(X`%X~
zMh|wQIYuf&Y8ab8R92%;X-z!3cEC1x3wqftfgpa@>7FMkQ_9ea3v+nK8gdo;zImoC
zHxqSE_JWksCpnGQU&#%7I!g_SGG3ZF^hysD*eZ<y=@mF!=|ZN}`-Y0PTR%p`S{SZY
zR|a`T?wcyi54gms?|(Xcu-H?oPyx{HzwlXWQTrzf$GV9oL5m+0h68y!^cA9C_DcF*
zK5XttkE-cL#3$rIe6&c3pC`<e$)d&-G_RI#o(#^V{&Hxq+T87kJl;F9Jep=Ps6MBB
zQLkicw{_JWH|V2(H~)IE%ZyEGrfQ8x$^mYmywFzNy}Yy*nc`ccyLeRl-=FOAIkUf9
zRcgTd%WDsOv?wK!o_2bR5|ZHALLwzr&tWLCP0P6=vsz~e!++kiVkZ3cnq_yzwY#oT
z(9=Wpnrp?U&M3G#{^nvC@B*73+Z2_`xpTR3+i3?CDjk{kzJylJu2O5JkI2b(-saiN
zyrX&9=>WE|rDdb*vU9plpEwt*%7t%C>qp1>TL+G2%=KB()q1|x0b=S9^L@i2!*n4M
zRCwC#;NwERR-5Tp1*n_{@q9;Lkp9bhn~U&TRe}D;zT6csRx?FGjCHyT?@fE3xlHG`
z`URQz>o~`uv~!{cghA%rn5}YMC0LL;XSS2#=g`%&zcul3hn;`#`CERQ{`oU~@okN6
z-O3~3>PpEwZck^eJca&XK@?U6oR*UeMXhPeu7>?DPIK=of|=r@d2Pa}7T$bA^lCa6
zPGfTCn5%0FW`dmW>liNRiC4$OxK(m_kOjYL(6@E5W1(Po<sK)j|2w;B*uU5Ib4mfb
zC5F>sB}{2_@{tEWzzZ<RoadsxbVSCOf`c@T@u@d_C8F&cN4N+4V2H%1tD(TmpSAg0
z9g=;5C`!vkaq0k`sYxj$;i&Oi#udw6>PFU}G3kJIu&fEA>EXMF5Do@v{cR`w>=*(f
z$Ih7g$#}RiL2LVA#70A0O_s0rj?2Q7t;?!=90NxLwQ^KK*3j?iz2jt@|7C}pg#=_D
zj@!1jlwR7=<1TAf02~?R!1%j<wzZ3X`OC~uRn=5z$M;s%dsM;Axfb83J<rn1QJ9-}
zKH(=sZ%tPyXscGa1jjd@I435CiV0`aOG*u1RxRqxf3W(MHWRgMuCsp9FyHL-#!jZY
zI%(%m$LHGGed<CBHg+@TG`__zq4!@7PIF$*s%>1q!hNoF+s_nybF<u>mJGzU!}~%7
z%oz2l=g9+o#BhE-+Fks^Y?vBa=}!x+lCE2AC^kAX1qZq0&NuuLzu4s4P!COt@)(Mf
z2-V&(jQDFjvG1}Fr7#Uf4uB+B`dfuQv9z6^lPGr+b`ICK9g9_+qeM3TgF4s>pfYLq
zhH~=P4IbGwGH~Fi$4L`|=|Y%uMB4wlV#+*_V+dHGf0I$O_^RL5Pabyvm$sc{E#LQ>
zriOF1lM16H<-?3+Q85L}x8)g4mDquODEB+})4L2XpEUZA_YpamS(^MvP_2lk4I171
zcIGLqQkYJ;npsJ|zp)Gx{8E_7KRePUp(Y=FP?&daz}(c~L<Iu#^UkycRPg2)<PFoD
zo?qmxoF~Ea;Y!%4G1-c2Z*pZQtjdh7n^gm5ipHUd3A(^=TgdSBlEa>nab<jCc38MM
z$ZrYSVsUvoaL0cO3<41<HN2ZT%~)bffopB{KHlZ%RrTnkhb5x5$Lx(K6nJrDqjlpg
zm%!qjP_@?%GUd-kCFW|9(|tBCK?~o+$0`Bda_qR7b~N6?F^mwpBt+Fw;A-GS9I2za
z@%!0>+jq~j(5$!|dOBLEXVdoXnIk|g|Jpm2pCjEXRx7iR=0Gpg)j%vmJgEn4IfI$X
z(#xsKxV?8haWFk9s)TIEBl`tmzVEw1LjlTMOLXuyo(ns5?nb$;Q=PV>>t7tnYbW_R
zWNzDYmgD&e!ykh>y0kbmbJk`8ZEFau#2KeE+!s-PxjxBix-uS&;(x^1us<&`H#4lN
zi9cs^KCqR|#)8SVlYVm6GCG41?{&QaMwdDMQgr31j{COXAR62;)wYob4iVuV-ZrBf
zxh-<h(V@iy6h2A$z-QuuTPKYoeA<w<Y-c{t+r2{NBX_8WrPqb?pK`s&?^iu#Q$C7s
zLuK<E_A)st0>j$4@Dn!d15VoBd4CLjD-CV+n8n+dJ$e7dnG5Ie#+$%6e)iGnuBYW^
zr_VkQH~fIA+)>kUnwSH7oCjW#QscE#sp&R1mAEFPR*t$W!{g{uMb!EvEo4BQh$v1+
zCx?!nb$K9lcq4Gx9m~+b!!DW`!Z*+#fs_7eCk{yL`O|Q6Da4zSV%x5dhjy1(Z1j;}
zaOIT#X1LF5n$g!?y3Qglxg-fwpn4v)nhyd8)_?Oo8G1c8zL-<ple>ptzUdbhs3?hC
z1qoyXj-D+{e2bFL*S3j!Cu=tk>4mB5Ycs>FLEb%E_IDoKc>coXjoUmaLmTjZ&{?Pg
zlx=s)B&Z*_w6Y>45=fOKvfW+c90`bYUujs@SNtH!RT?<2NKizbsk-yX5{efNh^0Wg
z^>`iAM-KnaP8t60WvHdi;G;WLVAob?e4tuXOw^LN?;Bxc5>hu`m=jkh0Rn$f3Ywrm
z-5_VM$e;j+pA$`&^vx%7YL_E*_lPsAxS2tcpnv_G3yL&YBDSEjBW1bVz^<N;)LV>9
z>h00Y8_K{X;Ugj1%wqF>E=;;8*(@n!3RL){Z{?9f%6=ZLlGX7@d8G2eq758P69<cr
zuKP0;eO8DkR-7djO(k1>fwj#T|J^dhVMC2k9kqc%C6&kx1kqWR`d#18KPn`s;*6@e
z-r^0es$e-Tb;Q%@GT0a(FAJbYZGJ0yFti$*HR0X7^~K?)RJJ33**Jw(S}HYmI+>b?
zqa&j$aM&(_Yc&-;dKpIi2AG_T^d)I&g=6}OXY9>bWuiu6N=&$5(e>GanZqBCE8)^D
z7jz;tsQrWT4|AHb2S@A6<8Z|{)f4|O+eSd-HMA9|BID!<-VixJG11C9Dphh&?~$G#
zy`D&Py_4Qk5T}e(qxhuno#ONenX9H?W*t9&OF<TVsseW~=OF-@6J>-J&O!_p&iQV%
zKKw?G#xR)TwyYH49sKW?6?}^tKB+<Ydkf+OYLc44(rAfHo$RI_kc2zd+ppz0Qyekw
z_ljvfReAm=jW3FN;Ck@R8IRmo;cas<2kv=l%hXJ1f1g8-Q+dQXX@3WK%l-e%hOyOW
zQLtvqwG}KL%0HMKb-SH&=KgY1xR9E;Uw;3`-xpu6>iV>I*NpRA1@AAO{K5_O>yob0
z=kLE3Z`pROe(!?k*ZZAZCp>%lesg*LUZ1~}E4DuuUpX5%D+o;LE?pI`w*e>PFWu5u
zo3`pw>d7bTtAO*ypw^iOa8Ruy1UPjGW(VmPnC9rs^-`SJkt4T%*3rPplPmA9ovSr1
zqe>{%GU~66x{Tibw!>2w2R%7Dxg_NJ#;H$IqINl)PzR1xfDO2$TX2*o?ZZZP_v4kD
zEB+`Zn!gBn)o3{H6<7ZGFN!fmnxBH4%%-|rTW~r@@>SldbtU$D6CUnNXKHNF7g)zx
z^l|Hz+11ZBPM>*B!TRwuJ+V)P;Q3U6Ot)REd57QLSi|M^B4PVq|H;?y)Rh`8*}id>
zlWx2J@<6SbGCNMJ?s&h#)=Pg@*{6dyHTRyqGUb@;&cz;!bETi%d;0$JV%f9xAGY59
zsF8Sj?~WVt?f1NOf4p`(bM<8{hsCXJSAk0kU>>+@9vfkQVJdJ%m3@u=@;MhTJLTUt
z*skgRE;@Vl5wU6FAy;`fnMOT(f8Z(ij+%Woy{8^?FPm49u|{kEnfr!c{Bk;Gi^jHz
z#Yf!BzrIzxn{DN(i+7y#b0i-<yWzNbsdrlqG)`tsOlmy6dd=6{g-;K@`M2uRrvGkB
zb9YFW=ZD#I&3v=t-J1TL?_T}$JDGZQU-bGIu8*6|1KNLxKi-r!zbjzk^qqfe8#cZ9
zf4}ti(@URsG<9u0xHqx-%|q+EJXfAq-P~64a6&(%`R!pGq*EX`)Ah2yO^r!8!(2C8
z>DfDfJv-g8vytUv?#{!HtE5(H9olyP+iAzLW5%{F-6e%To0?YLieJBP!t%gvd|xN1
zZ2BMhsVn>9^rpW$TSGY>rK%tJ5XN7S_41ISJfz1uF>Oi217Hj9+>h@UAMa#-l6mE_
zzVY=tRk{Vh#)ypc`^%24e7E;*i<Ntr@G5V~+^{g&jARzY@?cQ732e^Tpfh#P%Xh2y
z`Uf3y`(9vhVZsv0qCbthWO73n6nl&F{p0}Wqz)71qfFBy!-RJ{|F!q#-;gIMen(T*
z>;qo<@JE3IIPO#CpuTu%-8aLZPZ@WwyZd`zd`{xeHNpMzXJ4*g?-XD1FEgm{>QRCH
zyyfw4D`phr>?sdtE@kW1R(Dnu_Ei3M_sgaWfBkBX^JSieRL7TkM7U)d72a(8d1;q}
zdb;x4&l@9W%N@BpLy%$NyxDrC*8)%Ktk+wU*js24|04OcFvqmGzCBYWXge#dxct<N
zzf8E}d+4-<FIS$}ywirUQNrrN&ATqkZ|~ZixY#E5>y@Zo87JH!17b@gSBhC2`gMEb
zO;8IqW@im!_wnE<3mq4@zxb*AxHjZzCd<MN50C5#1vY8*Lj?VwaC;qpP|(Sw6#t0P
zsc~1+ow-w*Lboqk@AvxlrK3)EeBj|2rL!K}4|J-1vi+6tUqV0h=k-NuB3!@T@Z7iE
zvwPyKN6(j;{l1dyU%Yl<?7o`T(kHW#@-NHBpr}tFTr<C<y^38}QK&1)@#D_cORZOR
z`X=e<ExigI#sj)q++s;Btl&T{M1~4ms8RZ2;M;@=n(aqd^Q#MXiU=)y-MC~2uywHH
zEi60~3f32Sw@0jc$8WGI#e5gDyRzWkzU9EFms~+e$vTU%(OXwz^3q8!MK&$?sM_zL
zWMt74WtQI3t_v)0x51+Lb!7JBPu?q2jQ+2EEgSepVe09JJs-B%o7MYxKNoSdV}=@b
zv+`}BtFP=#mzsDMzW56_1F{!v_C-<wT3W&ej$q)hs5Ed~8Q4MxE<NdmI2Slrc?@D9
zaA`_1BpCyjuNXqgP~g%P32;UNwKaLb7J`gB_MhR<d`^vJPDh=A*G@Bdy85}Sb4q9e
E0O!wlo&W#<

literal 0
HcmV?d00001

diff --git a/example/ck_tile/02_layernorm2d/misc/pnorm.png b/example/ck_tile/02_layernorm2d/misc/pnorm.png
new file mode 100644
index 0000000000000000000000000000000000000000..65a27e8751fa316d585c9b7d0340f3c425a71ec1
GIT binary patch
literal 32113
zcmd43Wl$VJ`!5QC;Diu@LkJ$+-Q6XyxCMvB-6aWu5ZoPtvkQv^_uw9MaS873ekbqy
zzjf+<x}VOeLlw1EGt)9XPxtimlL=Q>mBYp$!9YMjz*dl#)<8gbA%}o~sD$<cIHHN!
zE{lK=z^@=J0rH+dSn?81G9g2CzSgG=w4qFjLrY7NjvEBae=PXG!lGcncxaRuTO>^)
zDI2H7ZBcQcr6H>=d*pGveochWd4zwobN`e-b6CF3f4z9_^T+K^z&mA4O*|x^9dSbT
zSTOLrgHk{O{M;czP!JG;1I3|;2natZ{(B1b|He~e;?Py+hW`HkhK7clo16If_}*UG
z3sq)<h~1J&k+09ljLdIDlF}wdUL%N;mI%T{1MW69Hr)58VZVNHDyF<tU;ec9fr*I;
z2WK>qLG$g~w^LJ7vvYF{k26RJ@{)y+(D(hDLn$yAEa>6pbpKDTR9G~HSoXJXg_r0m
zhZhekfj%z^OOe&F=;^o9dF+NqM(CAO#dm)pAoxl|LA3Ss8Z8Fmbt;T&pLC?9EhdXp
zNJvQDym=ED8JWna)!w%uE{6Eyf=>o)W79u6%Erd_3`ZDSVjv~W2)MucSc&@r!B@ru
z5-L02;&Z;XzCOM{EgM-`U(d(Q9b$_wj@#JS$Y$K)?CgAVvMRMN^5e&kwzjr>*{D4C
zaRdaT{Bn?;y?rK+-SqwS@%X|{R9_6m`N@fTlR5%BFjV%-EsYHgM6E2ewD<%BysjJa
zii8*lV=wm<d4TrK`=WFCKw4TzFJ6RVlgjSCK^!Y;X;~Z_W2d1(Ze?L&BE`q=85~sH
zl|)1k%+CWUsj6mkT4H<_Cld0sS5(9xBqS_ULKa8G#>N&D6zu5e*g0(VyUrHzOP-*8
zf#5DH1F^EVKiJv|T}H#g0>-?+-34Js(#R;QrpDda_#Ht+!1YnqmoG1XN)UE}^)+C?
z@x=xgZ((B@na!C!IfRne^P=#nsWG-c@w7@UEiE66$PmO)r{ck?s;UNc_7YCQz85xg
zzbq43Um=K$Pfx$X!m_Zi7#<y!yYBsW?QbXG+M=Q&R#sLUzSGBhFJ4|=MMBizY-D6)
zX2XWptu9vwb3m6hn)nfdS4wE%1%-v08X5%!1@Q?93`PXN1EzWNe1F`2cjqf2BBG|I
z29!oPA+0SS;p6kt(t`i89C|0>D-FcLn2@FAX1&u21{xX=VCqfG|E+L#b~f<g#?;Kr
z_4>#N2*u!KhsB1|-Y8NcEEI{?z=8kNHb0gtMM^^Q9*zK;tupTef{|E1up33J4<*<a
zn4V*V$;ru{o}N2VKtF0O|I=GuV7^|w1ZL|Lx(0S|TQCSISWx2MrND7@dU+m%JF^o_
z>}!sV(8;cPig4kxri}W%@18E@FAAt09Z0@*W1X%#n`)knegPr<WJBSD#}3Z)r8vZ7
z*m{UN*>{)UqONVQNh2S|TI`+BJQ&!$yqp|}Z1NKGA@0Iw``wQ5uTi35kd$J}G(<@G
z%DYV|{u?5NErUQ`xw!ZCP-4U&^I`V4n7Vi!(evE!pszuHJYbZG<?j(bApfUmH>j(1
zfzO<NAxnps)8;#+30bXl-OF{^ic^M$>JlSQ%9nL+-JY%B2d8kmwi!l8#Tgug#1{}0
zaT6?KDEuaUaA61)H4W2j{D85EH)taHry%cT>F#HPUeh2k*n|TlHvHO)0Irl~DukR)
z@t(%e*!C!0U*qb~z$a?#h;LkOGxg381HtCs2nKN2KteI+^8S{bbh=^Vq=UL2hW6r<
zSNT=5GIp1XS`H<#2`%V;{k^j;tGR4$z|+*9j8MPjXnwyu-u)L$l2T>ewjXbrSk8lh
z@(!r3_5AAUPk;Z|<Rl(GzC-KwD5z`wR4rRrK}o5)vXV%^)uP>j)I{s_t2w{*IaJoH
zcTVq@_AyBqo7bqX4S&m00-8%$(J+5pWaJW2?0VVU5wfv|C|#$oqeuFm#wSwm$_+<S
zgn1-jD2~0oy%m*|^mKJqGx@9ht{t&ZCK=!plao=Ak#F$u#{J4o+S^RKLJ>c_Jnaf6
zb`3NCo*yMrZIt3%<A;1}!>41vtt7lX;diU1U1#w8J*ZVvafH|8$zoywX{~aR;sFx~
z$epDKLN-?8mhVpy%i20Ra$%0BIPV^kz`!D8H&ZU-6iLDZ9!h3GeMKgIUh#}6G0YER
zHh;`!cZILF_$kdTh=F#?=`9W}vn=Zzv`c!t?ZZ;!DZb;?V5zloyCM8QM!5V8gf0k}
zp<7#9wY9at+G%EHc6D{Nla-yF9U2;n{6fOOfHa_UXFMO(X5V4GN>;Vi&nH`!oIrpV
z+3~HOcRm`E+eyH|si%@;Y&<C@ed^09*cSz!ljDDL0?Wn41q=rB@bKj2<>~3^?TAQ)
zVo7AM3Tij|oYSFvC2mle5#}axx+qCkxHQvFEfl|J;FK(@&76RMEHdZL5A62H>ER(4
zi+iRT*bhp;fV@}p_U0F4?pk$jKN^R_W#sH5BasB1WQO?*nH?VA;m)2>O`59SV>aQQ
zvyG(H2)!WhZrt=E@yG;alvx=dB!K_s0AY+U2P|h7mzM_ZLX3=z5)u+S-yMSM736<?
zO!Z{`EU`>?^oR@b6cd(GJl;_?+}7#Jvr57L)!J2{@Aj&*faIo6T?r~6DEI~q8Mr|Y
z4-dtZZ!gB$Tie>Q6-6W3`-X>ydwZ|0kEKSYD@=m+r&%s~Q`k&^7XpfDM3SS@H<cDU
z`Xk>YSBYIOd981U>eFrUq1N#tWaMrj0y_Bb_Go>L!<@^@{h6g^Paj|3HBwRrqvR(V
zCTh2fSq^e?az#RmpCAy(ex=RGz`$>FFzIW}fa3GznUW)+f^KV+s6g%JYZ?s#pWfLW
zZ~2c%xI%%NpOPnAt`fk@bdq;LJP;t1yf*rAXPUj*1MX${$Y0{&O-?~Hao!1X2R-=%
z>mnAKJdYx@yga-7XE5y>oYXY*lXJ}`{>LK3u;%0K?Y;H$*-5wEi9%rMe=G_2;9POv
z7~o@Auho}S$uXBQM<dw}dE_;Uq;q9;WAnD?zrAm~kslgxJGc{tOwZ2pI~^^SMMfGG
z73F>TlK3=vkKJ^?bGg>jWj!kVPFT0iU!X7o>5ig+gx09V`*iAbT~9)Sx?%UkzJ?U%
zU8J(A>S+H@cZnTLjjZb@2SZQvPI?}xkDdqD`}%nXUxO;y8savaqwBj0(&KA9{8#AV
z?}WW8=eucoM%6&_-zS3Hp1H(4hxw?e*u;W@XKoNMf<gHOGMI$X7N(7ETUdpdobU7A
z;8_z9Va?azWl8M~DY+c9ES8ZUKW_YFFnxFS)o&n+TP?w}k2nJe&XF&nwJK0bX{zMm
zBfq64kDkUx#F+}sJ2r~f#Ep$>=i9|aMT&}w8cobUb0nZ2<BLK=cXr0F!f-U3-Xl@b
za%t+z761GxE-oHCVVUb7AAbA_27jf{pwRqvZFJ<0-2R4W-*&KAlRe>Dt~)b*S@dMD
zCF<M7(gGcvq|3|3mle%b-I~@spGl`&8VVJ*cL4TPIN#Z5X&)FFf#5tlJ4<B6+_8=Y
zCn7^+HPMpNaG#dj*zzXC0&evq86Tr7V>V<?qloEw9gaF_1>NXk_MT}kx^d{=ZocZ{
z<u&CeSYbHPdps}e?Pm6a?Fvb{X!yOL9);G3vCtq<UZ&=?sQl{A-I+E26{7B@MVR!{
zZ>YaXl(&H7=g0pxi5Ck6Imroa-v{E_7B*zJu@Y^q;w5d9makNeI1iRQWt4Zw9VVrD
zP+F(%tgXK*-*R;jODl;GsaEg0!|pQ+6OX}O%aS#fkIPTqsneM}Jd>=$@dHE2?TOY(
z6qfe_<$1*O-@`wkYedcC{xA_DpLH2#c``8HzeesF7Lc`wzoFqcVjInAVY4IQRFG23
zjLnKmOVR{go(n`8#^2pLjOzU%T@gd7ze^D+I(~1A)>b9}_1MC?KAl6KyxfkznGtF*
z;(oiohx*rpV>^Xa;8CwbJy-CXY1V=YH6o`ufd=;22Vkyz0p(lWs3FSD*z-T1*}3VO
z>UwVyF!SkG4a!@SPzbV9NgE6>=G+E5lZrg8wY`nPm$cRR{nhX2n&L)|@X;m^aUJa&
zOx4jkCH$gKK@N;p;k5KA<xfQcjc|dWm?&qFq;rQ=DUy}jOJCpXrp|&J;YEI*mlChy
z>n||>*I3tr&Dcm;=I~C4Rk*P-V(+Hcw{gm1x9(bdPhvp#5Z<o)ho{H7z&Rdyyik`5
z`;5LAn3n9tP};YBrRs@^v%xCrknTbmxwz`Hd^TXMjRJ0}Kth2>($ZAl%u(TGwEUjj
zNdKUKf7!)GxNy*n%Y-8o-kS+eb9y`@twQf&HFe!w)tVeQ$jY8@fY#dd`_v45TCcqU
zhae6?DxYotx@bA-3t5WpLS*|fg&%2lg(fcZBOjp6k#FTLFShYGk+PiV;dOpU!>=i$
zjD{?m4VESQ0|Mun8Y5UZ&)5vDr4@~y{UEZ;N+&Jr@8WgUd$?s)?cjn)FSC=)CmO!Q
zF4VU!7o>2-BxPAw_KL^F>^kwu*nUk1KdZOKTYb{ReioQL427vV>skp!Kj>7WSNNXo
z#A;K#GJ2HG2dZ^8$Ti#3lmGN-YleSswarOehZ>fx3Dff!#@GDXDtgP0BKr<DWGZSv
zYqO8sRZvW#p-GgG`i6qS|8V(yBR%I4Jyc5HFKJF#022NNqcw`zPPDi;@O#k4nmbq9
zm$5sr?DD3y$Rc5s>49U!07d+ISLX3;uNI$EoJ&JuRos)m&s^Fh-D#Zq__H<?SL{{~
zCh%5d-2J-u<r`&N*5rM*4JMN_zlJ#awV$hTFJMc_5tg$Pj`qCGAolA&LiP9$HFcQN
z<39`M#k4C5h#Nps(EJ$3<v>n!CRU-il%%w8X=&fkBP8S~De|SN#{&$lBz&|QSKG6r
zZQIy&@v7RI_?fu4o`nz{@Mc%%kDgMnp4VoHRf<SNFXNL3vriO}h4k0gFptSOmof_)
zuR&6)oQ~z{D+lLkzkioI^K(CZf5DP<n)W~)+x<dEb0Fb(-{7fBbbTyrp|m#v?(eMk
zYRitrI0@I_DQq*<cj9s@Fg$mu9f9GLBW007BbvDS&+hKX+;dzTB<KS?_U!M{ykF_{
zo$(CCOjC^bjkDp$Bl#m`CMDMPZW%R-3=B@~Fx?NVHKV^dDtDJzu6Um!vh*0)`5{-w
za3)(6X83nK$7DOI#tUtyI(4wv3G8~AOKzq9jcL5-=JjvIhKR>c1rJq!TJq8Jn!?Ji
z?Gv{iG6FD3lzGgi@uPN=O|?{*^GI8%$O$8lu8!#$;Obgz*HD&*Sl!XzMa~y08<cwU
z#1)sO#@K`&%8OGX?YF<*=kJirye)-nJT-<~3~64HC!jezTjKNjuUS?93^OX-qZedE
zeZhYbAf?9oemnS9ECDRvm@~IHl(2~8sqj>uqp!2{(Jq^py^@hE#?PE(pgTCIUQTvX
z5(Tk~(<exJ5wcq|socy-1i50Hwv+Sm@sSF9^?2U>@oTF-GDW>z+-czps!9I!SZh?R
z=5jOt5w8~uLH0q<ZcI?c+Rjc%WC>qZbd9^;^1hp0@pJ?(5x=kNz#@mbap~Zl?C$$6
zMlk(a;HQQyA=+C98m9|RG$4@=c@hE=DAnt0N;n#%?hIu{Bb_VU_D7@F6z@mZ;x}l-
zI&Uw?;$JJ{P0Vwc`oN=Gn!sa+q^tRLwrR!`j&TpplQgdB%qj)*+oL(?$Y1`hWa_8K
zy4x4ojuh5!C(YM>H2WifA0FU@UdBi`(dCK3GeatR-qDet-6MH-k3*#RaA!(Qxq7DP
zHZv9f%V}lMF2u7vT$GXTeS*~GOLT!UjfqE|b8a+<Ea|17^WNTGZzM67S@-vYg9FuK
z4e|{T^_IY>p*zEmU}Bhgj-PSX<MB$B94HA4<la52&O1jhQRTc$W#k1!&C^wwUky2(
z@!uZBuZpx$>_S41I=(q3O}Y+Ja)uuVra7zH{v>3JM^f>vu^KDq5)%uWt$Zamd87m_
zoKrE{n&O-=OK)1D2k%R}UPtL`j1V_lT%N=(bR@xy@D`?2_&S-3Js+!96Ruc{P5A~H
zM|)Ds6fo1}dALV`4YI47TVq4NAj_~*$ObjzLg*sK%E~Gf+e1wogSp-0&j&Aly1No(
zwMT~BGjmUgmi+JzI5+D$VxuZW`3W3m-i*^owN*vV7J5&JA_^~QH%29zcR4vZ@xQ-1
zTyFI%)rNte$KDs~rxmAFUJmJFg)QHzkIG~QJ%@_qLqQ3#vGQta?cS$rf3k#{15u7H
zRc(0|n5p(J@LZf;$zM%SBqV^by8ZpIZ6hl6nV*}A)A$J0bubC=@nx>-fuUw*W}aVB
z>j0TA9o<WBM5p(xmnGZ1GE40(Z*pFdX)cZE#?>hEz-EgSJ~_QoG;UJjv9;YzOibjm
z{&TQUUlHRZQ3fkBUESJq)<t;^Y+oF%wpCf@L$wt6bLc@|`lnA?cN7+#xPL)ulKrk0
zX6({iQ18OUasB!7V@1uVK$=V>S@*?_VT*l*f_RezQ7N`#t43i0y&ACVd%d&@Y`wri
zCDzWGhhZyEK26Ij>7kHiZx@H9e#8nnU}9wOIXTW&eoji_p`(*kRNNb-|7)BcG1{CS
zQ)wq?CA=`TA1I~=>HV#KRL%xIEBNx+tJW(z;+0lUdUf@AkxEABD{>N2(g>^PihG81
z@KeQzH(ohRvp%8}Eb@8);=O#?`$r*&jJtRr<mTeS!NX%|YU=a+)LK?%FV@P$@f<&2
zuoM>}R^C6u^RS<#0j|5HF)GF(tn-LBo=eM<LuoCGkYt=R=}WGR&Z431C9ksW>?AFm
zs7V4F5&iZpRFtOr2clG%v?s^Kp<`f3hB?Bnalp{k{Q6N7a}#+B&0hmemfUD|$`D4l
zx>7Rp>FFsh0l_PDblg4v5q8PWRmfxvJ?`vOh1E;P-B!<jWu8!s`a(QyKOIwgdmNSK
zGD*vXUI1PS3=E{AqOuuzxU2k-1(uBLX1n$xg#}*KdK~t@#x*yoN7~B@jF6pygig%N
z1f6Y!|NQy>Zd^H)Jy&e`*V(d02Yi`_8JtorHQaZpZQC6=IXHp3Cf}?|!Rj7Clnyp|
zM;hNYPe|AhC#vCqu%DzK@NnY-OlFhzK$-h!LN?=?E9loW@XZa^MCp;m8$EbgPFN~d
zFAjYP3w&^B2p=Ec&D9l1u*L!+UQzQFO!>6^-uj6`cT4SZ>>$tIqUHz%5Dl<dGu4Um
z;P!;cVVbzny_Kb<rOn93=4KQz7m7nBEQKuJADa98{H{9aGStmhOdr+Wd@JjFOWus>
zE^BMCzNIDYypx5MRbF%F=g*%X+0E?Gbj3_hVQji-$@>Bq(-RpLlYAY_-yQH362zWe
zq1YC!8dVPu{@B=2)5e##=i56C4zD$%Ui&HN3<zZW7T#Q}_G|9Lk*!d4fP^wKFpN)3
zC@L#&tgW40US{#wX)hq%@HH1$nG8xw?TuS|iM(lx-zXwMV)dSscG#9gdAba2<g0<$
z_Q*%9t(mE)jBIRJI66AIFBX-RDdR(?{ny^$ENRIh4H`5t472yZyuOTB2C}*5U4zVR
z=C;SCr)47s2L^Tz4`D?`SA55zz`I6&Q_^Zuw|ZM(M$ph1=l0EN7Hx@8v(&cwwmDV7
zwzk^Z#;8~(7)if^Ja^!8*X3$A3DP(5_x?{0jkWR%bq+(r{jb+oi6cSO7G!!wUffZ?
znOnJTKNIWO{k$*Zfscud0PtzY>^s}<(q<n&j>k}ljV;j8(f#@J2S5c(ZBFzBtmYmS
zLvgvLQ%A_9B$=cXjh9-;^?qO_8MjG=CH`sNUC3DA>ujfzB3NR$G+Ju;=>){SO|p8E
zXj3_{rsgDo&nc^5_h`a=$R77n_q1ao>yoI~RSp-ON}eBRS62#2(qEgj`RA$!Bp^z>
zzQ~sl!@anIYLx;&ji~$1m}sl!{^yEo<Uh3Kx5lLn3nG%=g-RN}-TujVCC7MD;ialw
z&GO#kY<oqe_A_iIdhAsMR0D=*NlPn0nJXdBBU#kCHc{$=jFS8juQ+=BxBKba;W0>t
zzbE8k{Zj)E90+Hr5pp?cY3cLtu%qmwez+Pb%4+j(&c3$y-lP<Dn|&xYIx@Q$JIPH=
zcW72tjPg8hSx<ZK1PJpDQtr$flMauJxN>GWiBG*Cz_2B!r77fM95q)~vd7EX%a2bQ
zp%^$7SMATfY)VDT!&e*@E~PTRB<Q>7TRVpaV>4cHiXtw|Q~ag1%G5}%qYF6OrckJr
zsF;c7=H@P8gpZN00!wmeD9XjvT(z~7lqAX~8V%)EUisPOLGMvc2e&`;62Fgn^3JU>
z$PUV)Tlur7$SU|*ob&Ig_XTxHX{nUc@k;wMfFg`oW0nS0M)vhrCnFAZ4-bE*+zd$n
z9u<Wgg4osH-6^f__2czPR5zWr%68uGDC)6iHiNpdnwrFk%Tv-j6$!f{4{_(ezdr-W
zG?@RzZS`53=wkE4Y(+^ec6B)H(rcmkYGtJ;y}X=Ni(Gz3^<K8wdh}h*zKvr|Yun!8
zJ6kwr8Zt_+i`8g#22KRL9Y8tT%>P`^%Wh((tu_bZgehe^n0WgZn4iSi9^w=wh}@n#
z70mygVZv8ZRGgTYU}R;@=66{GU^nEKFHvocu_>C};=&yeJ4GhX#*-mg(T3avJ*X*>
z<1;hy>qpz`@gV++<cy4nqn#So2s8|gbYUOY1te_5{SXhwpq?MPA%T_Dmf=)$=im_O
z3|ox8H|S({*AWA^6FBjinPl$Hp@F(PXm4P|=;-K-X*%~X-lM;Jm&F?U27=7JRrAFr
zCWg#o($DAMov<&w^9Kj^#>UIPe~S|n6TdO}j=4ftexx3}In(y~Z>>>iBeVT+URLeB
zY7BpoSCRS<Eqt<}a1gC0R}Rao0OtSjNvhaHb;>6LZP!cSR;q-Bt-|hsT8Lytmnkh!
zC~>Xqysw)h)P?B!Lo{ofUQG%ZNw<QVPDI2=K>-4VVmuOv0NBvQ&N$ZBT9jW;QBmD@
zs1(FEy!>L#<(%G+47G*&z@!GA|H84>Lttd8r{u>gb!OAqyoDV98LZ>Sk$%<MCVHkm
zwsw=x_$#Do-r8|Z$!7!v1gaUluz~_<Wo0Z_+H?*`!w8xi6X;)1Khax(WqVZ3PR8Z6
zh^{~6?8+cgJU>%mG9H8C(1b^)EG|Z3YHG@9tUP?qKDlrm`0xe^zwPc~Y@1=-tAW3a
zpl@q&rLFS}-SleSQZp34{T#C*+tTf^rcHjErc+K|HQW4pdjscebKZZPMZ#l)7B$rJ
zE<B_J=}p#gaVv<l#J7-xpUm;@Vs}T^VY*E3>FMeDamLg2R6?P4Ff92zDqD;FheP0Y
zt@+^lY{y8b2DU`3&-vE3u(15*CY$52CY^EB*7~a8wgtZ78<o#`s^`8YdGe0X+AOc&
z{@neu@r97)nQa{KNC^ZYBQ4!OFp$f`<+{;7eGg%Wr;jp_`e=AWETH<I2eA)Jk~+W|
z*_@iBiy@(&e|A2w(9l?%Z4CV5d~~B8uEoKw%OSFsNn9r?<EivQciTsAt2p<UGG-54
z>DmSctBX&(;KAX<#fb&5_2%>k%ITB<1{m!FE_u{PxvkVB1DS=<W3#eeURNxf`<zD{
z?4tQ#IFrg8z^632Z38H`hK7c06v^fO6R^set%_i0;hA1`ZIPd0>+^WBhcyHH=@eF@
z`C2<dR8%<tKGqRxfrT~Y%y9hp56B6b>6NvD;%dY&3{r$svyTeqylYcRP4^rdOAfD7
z1GuCbtN9ocRC@ULO&?qScMuMTYRuL`5{wjozx?^ek{`et-kCT1MR@$Hdm31$+^`CO
z@p<iaB_%t%!|_nP)<;&*OsjvIIwXjp#eKItI9z|=+k!u-n(QUry$<ijrIrQh=!gZ}
zf#l?hAEl6Tz#$<+O2tA}Z{&7`U-7rQ|5kgPMbs)x|NdRx(eVfXUHR;1$2Rd|W3Kkz
z5SU&@;?=dv*XisZ8Ymb;SIu!$H3gHDKWX!4pG@n2fs$~fTTqAvwglYQK_IsG_o&qH
z*f)gdn=x9#NLcfpmw`p;D|XdK`4kL3W|lRb+vgQ<HD<0?la^4GbZ!PNuF3Inm-DTW
zuWhG-PUe&W*L4Fqy(i@9*_dC*ru;;IAVQ{_+oqT<C0<)tVAg#0)Nkbe3(>v@cYRUe
zi>QhxJ?cA7Cf*ol1-hS_s;i}iub=7|wDZ-!wN}^vYY>msM!e8KHl4r7b(}?^{-&m;
ztw_o!TE<?_hPo1?P*r~jI^N#?{&>2n<>3jOBm`6%EtuP|+of+(FCrF{t0-TU#MctO
zv_ou1xvYt!lk%zBYAJj}uwXOjA{`9Sz=ef{x3{+k2S@>*qr_^+j4Bn9Z3Hw?D5nhJ
zGPFC(zl>SQ5g^m0qHrcHXh{hXrv)0R!FG&quqHNxzYU!Se_p86W_r#!N?V7qy_6P2
zTT}Dvk+HMu6}KS_dzl#)z(Qe~UGpdtxpeSO=nba^HlcOC@VU1kQmE5r7l<$5O>!O<
z6nN+4KR>1KKNjLZLe==0TF*SK&I`-@)Oglr2uWvjm(7<>mhJi~N|kC2y{3@NH{v~Q
zd`Furx+N<tWv4|8N0{Vxc;H|=J9K2p95$(QomeV4mT`xQqJ_<G_7qoBAfZ|I?vK^=
zS~~&qJpOH6RK(|mm*(0ck?o~?O(ud2VAWdS{#14WnkhxL-T&O7^UEz>;H!ZX2GmPz
zHA`L<T_2&-%C)~bWLkwJ3f~)FofkU>eS~vM`(8)PY+GGA&#A|2VyAe!Mi1uJ^I&yZ
zW%?7E!qs$QNst5NYw!NHe%VqRO80x)t6;a3a5dl&;EizmM7dE8*Ry^emF`Gl-3<$R
zNu;r3#po-HQR+{#z4c?+vn%#Ty|uNIt3t=P1N5V|(eQ1dl4_7PJE%m}RCn8RQvT=6
zSFD0!{mcAIt{7>6odE@umeAKN_YI4>b_WfoJ`)&Wi{)SOg>94w!0$j6+=cBLFvinA
zcL6#rn0s@@M?P-Wj~iQU$PAdXw6*(s#KrR-PJJE)naI%gabsw%-;04_xRriBakp?s
zH}zVbas8#NK9q8^Be~k+K$DaxwEqAnz)vxJFR*x8<d-IlwQJe+Fj{95Cik++@Xe6>
zcK^A{ug8ZtUpzz;9nI55c6w1!-lEE)TYg%rhVXzLqsuyd$Vfnw<vC(~3^EE`ZY`ms
zA^hFUzW*kwPW4>za-dEKh?L0Vo5HD2gUcDl7qSB3RyMi!s0bd#`dpm*Nktc$P`KC<
zpA$sQ&Z(1`+o`!asp_ueKng{czFLl1B=C53LP8+zl){DsJ{HZn6u4SO-m!^x)eaG&
zxXpnd+*!xd6p(1#UQKC`XQYfc@vH;tf({6GqU}ZjiD*yHP0h^x@S>U#^VJc$)vR!e
zIx3!{f7#GGrHX>Y#R_<c2@Yq>+qVfH^@2tfn~Nn#HMZPr3SGwc?B#hF$6NzyCnjZL
z!OxXXB&E2nO9W-YxQ26AdS;bnhMB`+XzSYKP2L`-$cii#xg#JuW8O3Op+)?e$slgi
z&>tkTB285vw?-N2mNl?bE<ZJYo*U^tMhn%m>B$;vX8LBcHEUifO7^)Rl41^VTm=lx
zv+yX01tTFc_?rckgOd&g6@Kt~0)X~jpHJ;P7F#KoYWcBPfE-b(0J60$%Zfvtq8}M8
zVdMlDntLK}v`=X7JS6!Qdx)j|s^)Rvl1tW$=;SIEEUy*cS9REYNF9N$LPUVLG!%Hv
zrx53Sh07?#1(~Lg1-oNuM8)3)VAE8!wUm$urHA>U;b$5}hH7HVFk|q4`sBOoK%5(n
z*u^dB*t1N1KRud4_T+}{s{;w0+6Zkl=J(yZ%3$-p(9&hSRhglfW5p{Pz(WJbdXkqA
zS%t4YEC(HO3y^87lx?@};ZD{I%^_MayKWtaCii9XpBx?;@EOe_`C;Ss7>KKZ)Q&p-
zaaGC{)IUj&_>scyJIko$mgcg5$8BO9@(AnvHxLpx2GHC0a2s)tUww_##TB=O+BGH^
z<7a+%<!!=5vVX$U?XKr^D?SoQyhPe+`})5Mywpe{e!2*car#|7w(|g%)G_T9)pm@&
z>Xd1^Q{twP8|z?G{~x5CMxJMWQav*{HdzOBaXB=@X1S6P8ns5+#b<@p{<ER%t=Pyr
zhXaiNLHlcT<+jDJww7<t6IuI%y~#<1>^291Jn-GScL04C>(CX7O))uXve>4w`vvR!
zv{CuR#f3rw-N*lzg-S@6Y!mSlMMjQXU(a{fyUWQfw46?1J4HuNWJTM-G%;BzR7!3M
zdhi~sU<#eYO=$I7DK4~paOrdrM+N8>NLyQA9dvJIG>1i58Ts~>!6-9uGPrB$`Wh$V
z2Zr~R{n270zzyzTB1?>cFmW&!D(%yV^in<*IeH5(Gx6W3_&;cxn5d|z0E&(aF?k-0
zQYcg~@E|oTOb$r1m;4C;ObI}4!d_8mYik#|>xr-M@bVhB`QI)tHCwK<jjOC%Sed1B
zdhE45-UG=$AQoYN4~!!M!Wa%9zCb{@K+?blejWi58c<$|{qH+9FMIRG>iMQm@%&|e
z6^%&y)@G8D$@YOEN7~m3T2v4U!_`Xj;@Zc;1IHA<h2qM~tBbq5+1KsGY5_08Wk0jh
zQKOXUHpEZ$6OLw!VET`=igbSTLQ?^~`U#Lq^D-a&11%<RD~8AqbRrTSM7p+p^wW(U
zxtlGWXaHSMJA!%~bOG~r8kL)yBm4mE{*zEd;l1^AOIyohNy6w*8KWc~Qbeex=~&S?
zlKT4s;l=5}Wg(giqYkeE^se%t_KhmaZ!f-G3?e3?LS*wvL>Jko+dEyRo_k#pWzUR%
zBz`+EX*o|wOIZNp(kVWZ2kXB+KO(Vnyo4sto$t(@&SkcWv{FuJSVbEVnEus(sU&SH
za(ulnRC%ky!^woI{<eUGMe;{>S&n(K?He*M%S^EpXLniYIo$DkbBKush3b5+k|I@L
zOVHJ~;brXj*bbyurk~!!B;LPFnd<hG`HLzHM?()QQWcAn{%Hff1PP1@L$C`-4k;}J
zSu-2Y_j|wZBwz%HewUXQC@<Jy{`+vX3O$|5vTNADoIgKOy;kFgAuOR>ii^%e!;i=!
zK}!Fn_enHhOh8RI1tc)aEh7VNhXtX{{asNzpJ%F=*b1*?d3%4cZO=t>^FZhbPXVWU
z%3j{kZs+|njby!q%6mA`Pah2ic^(o?JR7>!6RS(JnFD&JW<77&vcZT5mBB$KXs<J7
z|1iJ}iIoj)CoRFVY)IH$-9F`Y8G63qjbe)qh#M$`F?RN=Kc5<%lopI$O7#tt+I@)J
zz5DQ?oS%V;lcT-><$i@|+!m_1M%|QYAqU1-%QMqFXC#sfSJBL(?A5pq_`s**hVwbG
zUBs^8Lfr;E&Sk9^RgJlun3&yO7!O%4mRoP-gjVoFl$2jKKPHk!7V*=8mM`sjgKfJ0
zWUsr(Z`!Jc?jMPS1I&BXJ*g)q`Hwk0I;Rlk47TyQf<)c4cUMLFzHL-E$%v4VpmSmF
zMyw$wF0aqOevb-D9tRSh30}I5yv|KK=P8tno5jCsm`heGrs5M2r4(q!;!>3KSyuj7
zE@o}eT(P^dC)yVE!1%Mgf2GI5!Kh;)wm-2GOefmdsQ)8%-)+8-1i-YfW}2qkkh=sW
zZC<FIQ$(_{^|L^n_D$KPWHHcVj;6GIoyus6^j9#%jj4Ba1%hUTJAQu#<uk~K4^1fS
z`v;ZGTZY+FTyLY5Qtpp;z_gEOHF@o)lPsPI+@9m5h>T6QVPW*O&tx-{->%N-^jk=p
zPGr+Z_1hf7?7H_dzZc_Bi25Fo$Dc{{wf^Pb>er93J60qyAbDlg?;Z76fBY_eXz~af
z?OVt8^+8q85U2T*a9SKbI_0R8*$;uY+ImL()mlEFdwK%lxkj_?h`7)#*-6={BC+PJ
zzb4=Nk9-JQtY1)b>?iN0mpXB3o-?6gN%_e1c!zPavnx1Vx0o}Ib<O@N|66;EM<Eu}
z{+n;^l7z2aH1%Dj7i=>s)d{(#g@`%jA^IM1yLKTZxn^zc$9lF{;LYET{NR!`0k1*F
zlbmUzKvw9N*^o`6$CKKZZj~|Z?a$Sf-5MQ6ap>BStbTRxT%%a*AT-153S8Hp!g}(R
zjOUKH#dg!?TRdO4W~8aDFLLFO{}NG&x~kEuuuth3X{LsU*Izlczh3^$#_MTgsko7`
zND{5!%ga}x_u(;{`w&u;LBXoz^VI*?K|LB=|8miajkEmE?pP_crBlVxqAzESZF+L(
z7jaF;hM+ZPm2k2?RE8)A$aE~}){*b!fk4hdC>>qZ67ImZ_0KFnkIwZ4(cC~U$986W
zd$ai2Yr)mw3zGoR@TjD25OpcS22VUIQ*q1)9@Yza9{n$g%i~!Me=3YE<aGCPh9crH
zo>eB3mzfjm!Xtjgf#+UUVDhEF8`AkY{{{{QeEpc!ZvS8XiEF5hfTalE`o2{L+$nQZ
zI2!BP`=l-LyQ$_>*0soC*n}&KUin}FyVL0J8pWEgo9RBB!WQlJc5?FlP38rSb6)*>
zeWt1r^PRuwIvg9Xj(%%spL$$>$x5jsKc%snhcF$mtX$FEZPZN9XouNrj`_4R((T1s
z?Ca)s*lng-UNcq2-zx}re@*Ou*`|t7wd8TDg5N1@epp25V!OtqvZY3Bbb9vVGVQnI
zi~N)1--)GeL7p)*_=F7TjsD|eloRWLAcnZl9&IDxxvL-7i`ZKlcC)mF+}pCA>o{nM
zvr(@_2Pr@G*%C87+;AFpeR@*%G3_m|TNue&x2x<VCA*$lIU@pps*(-h#5rR)nM%%z
z`GroF{%T;9*J}p*>et-9PvE}XOsf#Ibyn(PkE?jIb0ldt*2kf7P`h0tboJccL~Moi
zkF=zUPMfFjdRF#h=cVnQSI9?ayFQTApcITdWnOD<5xdp=!|Yk0{2bH%w}?NoGwz0S
zCcD497xh;JdrfjDroma;nD(#0pX_tM1$^4B(c<yC2){@a)RLq;&yl|(YD6V--=tye
zH6^k`=B>gGZ_MN5U}2)qgY+wD7O&c3L|A9;@Jl@m2JO}!&8NV}F|2}FeaQVfsMsA2
zxyX)PsQB-OeCOFk&B8VohXjyi+hndfaCnvKE?GRnw+Tm<D2zXUY9!z<zAKlT{1NM_
z?N9_?FlC|$XejPmax^eGVo5Ms*Xm_A`>Z)U86|0BgX=b0`r|larbTPRdhl`IcE%$m
zNc8SZ*WmVk)*;bohQyj6q|24zpyfC~)=zmKH$T4Fcg-$<kwenp>tqGA_Cnd>Eg|gN
zf5CjJPXYyXi#Uxf&fIl^m%I5NwBcI%c~V5qY$H9}Mk8{%G`v1x^P}4iLLoZW_N}RZ
zU{qAeN}JOCT%`;SsI###I$Pn#XilH(MvAhpj{N4CITj_2_dg#jfpEG^1cGDc>4o+U
z1@BB@SIS=Fn>4VZ@w&6IPm4d~v}U8p`84*bRZY|EzGnNK1$23Cmf>(`)SUQIRs}$L
zkjwBU8jT_ki5Pdn8h!+&`Ms<QfW|HB4k++&Vu;I9@3y(VhnKu5Ai<`z%Fbr+c>m*f
zZcuwe#4#V`>9Bq6R~PD;#UQr;1%v0wzqZ}=i}~r69V2GXwT4+H<5v#Wi2@;N!qbz<
z=;2|-jAUw5mHUk;-sLv+mXZ@6WxnORc!yHDv09^K#8jN~;G687U5a&~<_ITyWR`Ww
z=qrv?4bicO>xLA~Ln?XM=CU~-r5P8>@K#4*8-Z6>Jvgd<iU$jHk2(<T<2LuiDld_Q
z&Vhn@)b1cvTwIN4Mpc0~azwvQw5xJ#&tsy&{U2EMpLgl4`^h40Ej~6T;ycdy1pkj(
z7IBr=_n{|Xw?dVep@~y%mWph2v5Fb1;XPN}%Na3qX|Gnr#-CG@Am!@}nYKk;nkojf
zBDcEH^tkrpr`38@>6O~IrW-4NbNq}VYSN#yr~rPDkMcHkt%e%COQprl(0yLb2hb)$
z!w;kF4-&zhg7xiBxQxra4|k5_Qr|9m6|JCg`Un%G3fD<U5ZNps>5yNr`Dt4auCuz-
zV>HtH{*vdCne4#9gw5Y2a(i55KXaWl70$Zt&Vd_-IVb#aU*8n(SbuHCK*S*})`m-E
zRhR0sd79iQ_17yAC4&iGGdU7526s5d%KKkCR=C;|dn#)X+pM<DhQBI!wON?q%4>eq
zc%zKJ*r!BR-nZ>oQFivzwywn<ra6Egg4~ZD)4@T^3wGYBsxIT+0);f@<V*cfxB3Uj
z07WTD9?YUyo+os^IYO@dw8ul0=b>4IlaY`!y|?Uqs2Q>}$;REZVqdtRL5Y{+%j<Y#
zYNU-_JyCV7Hav#Qjn1=lh8kfb<aN-1UN!BiSggSjRF_jeEgp~0Sin0ip<3%W?dho{
z5DNwY*;pvIumM}k&a{ql1vzCoAiS^LUsm{@N}^UaQh1W?sDVw<DCPQRlZ>huQRDJw
zF^g9Y7^~rT6du9y2dZD-m76uO(ZbD?#^D6mGR*@no_tk)(cik)WL=fC*FRddfALl5
z_|$ktSJ}O41Cr2w+FJkTZ<9h(FXx|4jXX)aV-P)OOoF5YTZBR_f31!uO~9ThVO@8(
zQf3+v3-zE<HzQ#)4?+k*t&ZpPt$Rq{g>A>-HxU-1UC(NmQ(RB`rc6;+W_D2Hte=Y4
zoC~##4jdp`yQ+5^Q#^_CujQa|v$~XUgf!scqY2i$fj3=2Y~53;Em*!6I%VOIS4NOo
zd9ST47beUa;Ov1s__X~Rjm)wGj3-Z$x?StD>lUS;xNEOe(_A>ye%E!^Zp00L+trNj
z<>QRz@ym8oX?2q`IBxtBbo-SNW%`z{4W-O3>o2AMSv|*+J7o*nO4FVF8FbQ%)hYe#
z{6;fn37{HP8309qWEh{5sM@Cy^R7CUxG7U$V#R71f|Y8g^$Oy1#Wd@!Tc_MpfN0+e
z&OF-%b&mCTDFs9d+2>JcMAh_SKv4Pt1)@~oA4eC~pp(tjGqLs+kMLK$N?y0-SjTdU
z6HbpRDr^vf2Gh>Y3>mD>t&l|~5vRs`klo?UYt!>sZlQ6~J;^OoZ>)X8`v*b;p=bS{
z9eoy=%dJjWEp`FtPAoS|an$%#UNJ`k2M>E(uN)$0SF1-uULMBT?)o<!@WxK?KgV7i
zXZ3=IUksvDBO5$Rt?3>PY5n@41Z+z@Bi}A;s#hzC{JRIQ1w+cudZ+G#`;TUaL0wn%
z5Ls7q00*!D6mhgk<x0rYH&g7RRcvU>(~Wq&?PBa&o63p_XSb0l&EFQ<3Pg(av%*AI
z2I*^us~ucc2}B$k!a#Q3O}-BrGp?*sF)v~^%bdKFN{xqUKFPuS6ZeZ|c3t_Bya-M!
z&Yg(r{>qjgH;wBSY&cjz%AlZy1+C&`v0nodPYw*YT!TBNb<Kh2%8$SQl&%giuwE9<
zkD{j19r~s@OWej>i(`nW-J>7PaV9~E(^)-coLZZQsAGX$uWraXQA2XWhPOw<@-V=J
z5=Y@{V8;To$>i5uqugvGHt99j9&2-|cyAq=;oZ3rjzOnJ=jRO?kJMi|LSvw5@B>##
zX2D|pp)bd+PU(O*V6XvL21A?GcpmNpesPUjp2DtGnmKaFA1mOAde)kx9R9u8w&hBK
zWli03y7I%V@nZgsem~^SNn9kqM72fibqyUBg+RON)n*#f!_!=ub1&)bY=XG8S72mJ
zVB&07Te<;yhxfHzRK*QbDd6@%=g+IDK6@VhfxEBz)JZiecAOYz&)4D^4PtnrVg#mq
zby7-Y-r%Td^2g2g|Asi`ocNcH9gojf41&BK-ZnH|q7F4|lhQFu_C0+uFt(`4*ez{p
zPF<O*&;AHV0cne?;uF9$gu$7z6AqgfjXqyK13;1{8l=x!xhmW|J!n1Qxha}{xbvGM
z{-pbZyu}7zVB$to&8EadF};-=mXT{l%=^N&Ww!ZUW_j#DyKnBtiRkVRmZ~WW4obDT
zn2!EeCVUUqNJ&romLA7bcKoupX|<UJFK+L=YxCmn>8YC29sDc5^&hKDqd3KldzOP5
zU*C5A+Pv5b36a7;S^3i|1bNP`v`u4dnykEWEKhMcxB4$5JtEt1aReybU3dUtds(*M
zTOBrQrxFh%VQ`d_RPn3*ZC{mSpXi(~`a))$KO)*h4@2?}a%W|=4@pgOVB1GED%?dD
zR#~Gr)NtrA^P1j%jCryU8EL@94CmdJOJ3q$4|9lhlv+oYj7X9Gbp|aBxlXz%-g!QH
z7ggoW#!~4P^noSETqSf=;G8^LQpeb6(snLrC%1^HH8O}7RTI%DMFZ>Ew|$XRjF1}F
z?T}}aMY5fb<BY`>&Ci~9cl%MsF?4d;JCK%BP2g<bYbwNo^c%zmfkWv4E253Pnl>2S
z#N-_xuP~nWm%cNnlhMX<o{-_*JHNGQ*zi!vHi)_r!Gb5=k*T->$PAbP$mEN^`&9BW
z!C=)i-~QG^^va2A%w~eN$W!PiBDsE2M?>LUN+wPkyU4v%K2_0|{p@-Pk3(|Fzeo+)
zbFSP{;$M5A9kNc)>$Qf92*9=<hwDIg-IZv&*)AT7f(iAvx3vs2I&AX<{f>L@HVW?d
zu?FNVH(Ztq`>2`twe0;{UoqBa)`shD*p=6IX1UZx<UGc^z(nMSXl3owJW*Qs3+Fw-
z#i6NR6}i}Oy^z}kN+%`|sVyCc=wMe+8?zK6ucKH3x5)eIT)$&-ZL(=MmxLJQ9AECX
zINP=8v9|_EIRWZXqhJn^nF_S?4J0Q#RaHx>iiVvP(n@3hQ`o39U6m0V-ZTH2&h4kh
z{=q(c6)E7Ap6x*N*v!mA?m+FCm@8M*cV?0cdbH^Y!N(cQ2n7pnig4}k3d2Pl>sa?v
zo36&av_=vlQz`SMX{2RM+X=TU0?2uv_aefq-jo$gb0cva=cRXN%ab83w3HG{7z9Qo
zolqPQj@Du^QgX6KW!DWZ+E+c4gtnm78<<iIC1_%z6CDKMCBi{iILq^#_$qVeTvlY^
zj~&0cT?fx<BcwA#d)mA&16EmYgAYB!qjXgVdgxG?c**tYBxKDWZ$VqT^a(3usZ-}Y
z$O^0oK2ZhDgS!Uj$VdQwJUu0IkSRF&uAbw=;a+AmIT^=uRnOVT%-UeM3K8>yF5IKf
zA%-}D<{%sDkn-956B)<S*--))X79s1s-r3tFNbhh^~_MDC?#ts$DGIWQe@h&t?r`~
zEv?*?Rkszl>cooI>9BU;pY7g~p%X#Bzuz^1ZSXUQM@!)<zQOBdKH;HAz^rui{WT=^
z&8t#oAB`@<(f0*KA_m%@<n_vC9%rO3*%tX5bx&qei$w2+zsR7PB;^IY#XKH%m^sQa
zVjk;4M$xz%JiU<#YeppIv5|{0#Y=biRB5$ON&SbAJ|?Gm3s-B=-%6G_tR(q#%P7oq
zc$F5`(5;4O9W5Ywfv(Ep6etpE8TqCBXmM_6c*H8-N17SS)7k&;<`Kci*%sIAaIg6@
ze5GWk`D%pcn}+qT^zQ;G(xp(lhAZrAG=h4jVVYvbVm>2}aof#Xo8O77mUmHP1${}c
z_kMUBRrxI+jJ7vW4cgZCgx_=YG)!~3amJ*Z)m%?~JYQkD_Ji1(nZ~wuB~%9YA=*e}
zJO{_QMx>0+=-F9v7JBqScsH}*p8E^9Is0+Rmk9?{f~c7|zutWmO=K#pvS?gY%<uW}
z0E?=kyzFhVY<_1kUDGn|Y};i%bI4aY&&VjSX1&X7R90=X+QP6NYN`M(LzP3#X@0zD
zEH*J4x+ShMjK@r7n_{n=<t)ChPL2m5JT=M3lMW!Z_V$T@UZnxc0g60N`d5(f0%t&Y
z#oA-^wl`R&WLQnP{{sb;u$F|q^unhCDr#VRO3f{$=kML@Sj}q@G3<|%i+TK_UTyEv
zZr0S;;q-c=WcEvwX^Yv!85Om!H%qQCy(n#SL9}Ja&yW_H6$-hz6COD7D^*?;f6G99
z#jWio(-#x2ZypYU4x-kFs?FBjT=MDEfL&3r4%IHK0TiVkho6$FDNDx6%)lTRnm<`A
z30Aod(aj>H$T_VIM60+-gmh-khN{`-@OMi$KKtXDseIHo)1Qk)cjfF{Wchm6xp<sU
z_uyiSIzh^mCeI@NOjNLiE{_6{jer^-=F~I$kDsGuFa)&Ne_*}6FvJfV_l%2~L5myn
zyQoBYgMd(iUO?i|X}zSM>*h8w@;iY`U?7k~gV+>id!abAK3An@NYnZP0MEs#f!(LJ
zLtg8tV33ax&c0sPfUes&P@70=r!e?WQgd^XLZDH-H6LJN-g^O&9YV`_h|%T89X`sM
zpyK!m?TRUH9<cHJqnMSGX^Uyt(AaPX_oq@yX@3R^KoKFb`>}UTD0QYZI5{MHq@k5@
zTz1CS99xz<`Iy>uY)au+skv(?yr@z5Jv1A|Xs)rRtOy9~{O{r7F|iXwIlpnT{xp8y
zj2$z~`;8>Ui+}(JY;6B~f&TxKq{9DCTVGK$4$rqo)pA6mfWMd+HU<V;dwY9UK9tEj
zr_%Pur5LA|PD*^VVfBX<D(X$o2+w3t)mlK!Vcg=48^ui(C;!hqeOKaSI?&n4Y1WPA
z<8x~}Sp;ah#6h6Mi6GRU*r0SUU~BWcI%xDfoUd5p-~{##5o)RIhz|QZK^giFP2*t;
zCudjnh(E1t`fY7&boKQC88hH<3ZDOH4&B_|&S&ALrw?1Twp(uU*k5dPo12~eV8rdq
zYm+t)^p34~Y6vx###{JlvRG|kU|=W81m=Ewwiz8Ay+af|VL~Y0TEYq+lg|+G<DE|j
zjMaaKdwOIfBtB1kdx4;iLSpN*+~VTwd~$kfn-%_da}yg4EjA^k81Q5#<&}d<VX&m6
zq>+&k8K(-}s(-5Hva+&7*0%`3XPlgzfRyL_>Cx|k7w{0<*pvVwtU}C}CAc1i^MIN+
zGb@W&pX#4z`k#NC6&E2`|EGDah~L%h{Cr-$TqIFkVq*8`s2bqOhDymm2yk(8q0k@8
zv5AS15fNWMk|6w5v#B&{a7Jv+EiAlzxOD^gEp>86MIOMlCnhGwOh8IbE@C<O_Meei
zJZg+pNlJ?PpWFLzZx3v5uMBv=_as>$$vHV}R8(Ky_ji879JkL0eA@_9`@l#6nhU_A
z38-;^Ap_X8pR9ntx~{HH$m>Yj2^j^YQw{jCfyhEzKyz!Wi0}G8Q#}_ULQP&Wh>Mf+
z?BwK=sVM|dsXS%@44FGU6O$4kxoWMl93ntPMg7-LT(AO9&F8!k_q{0y2L}h>8$$?2
z`TqU;#?}^~(P9x4OrNks1cm5pL`6iDHZ{$UjlCwX0vJUE1d>X#o*#fSwer`m>8UAp
zE-t_;7_1MNB&({b9&b*&y1O@ymYV-tA>hd-6Y|7rJ-yu5F*Y{-*Dxknp%$?H^Kx_l
z`x=70`_?cq@V$TwS>S;O2O|Jpz><GP;b6K)#acj53)nUPQ|KSjrvmt=jeviIF@QcD
zqy7IJau2yDa_!n(U5Jr>=d(0o;33`IpwFfe_rx0|VHSGhJZPi?u!Qb_fg5mvYuw)5
z{5o{ug1y>_ikjtl7SDY6_WdV$yfS%qNRS=uXP{u($Lk|{Hgi@*d?0c9Zz(*DwOZ9B
z(_ek(M)Ay8H}XmY0-i5Bg-nVJL5@KLLjE;LEoW&sDi5R&Zsm5^xgb+YxOmDxF4DpI
z_-bT!z;in(3X8<7pHPk^&Y`1W+|D242oOmd$$vg1t1_w}c{W%}Kd$|eVO9v_p8vVc
z8t5rcQhf@}D^9}hyg4m*ho=HSyJo+qGl^6m+4U5t{~vJW_?Q`*!rA4C_7msz8`cVL
zFBKMs`zu(Btj%@CPQI&h0Ncv}U@84iS6G}W4c?OB8+(_pN+kS94bn53;3VsVxN$8l
zB-R_1tbQ4mCNO-COsWec^fFUa^2cQqvm)#kN+&W@S?9bO_A=tMXCK6bpfn)^wpyES
z70{awj?8KwEozSS17^Kauc)q+W@`~{vgIR%kOnDDt4w*>UPr8cmuZVfkD#oW@XBn1
ztRlDaXOwr$a|=gF5fMt#(jn#L<<Zfq`yxR_`VJ1T>S{M-<>8L6sPt;t%DZ>SiT^&G
zu@iZ-+KtuN#nC<9n>|;q#6K_c#lI$lsqUOM+ikb?fu1R-BB;B4nW4}sgDr@2_Ym>)
zv+f}qJ$*`Q>i1=KcJ>C(LlQzlonnww*KmLT2YPzc*6oMe^JcH(BKQBNx3_GotJ~H!
zAq2M&G-wFHg1bXP(BSUw6L*IMcXxN2xVvj`nYa^NgZn(=U29eC(tYZ~x#|ZD@4b)K
zp4RTCGralzPvzK2c<g`1Q$MNw7f=1Q@-aO#Gcu~LQoCgaSnoc5d`C$5c>i}tuM}u=
zG&MC9`}l6kZ3S3}A8(FRQxJ@7Fy+aCA^TPQzu~GkEdPtECOnXAXl~|q`kIuONJ&p`
zWp3_vG)H?aD=t1{z0ydI<R=IiZ(MKqLKRvOU?~AV0vO&hcXQ_T0|k!wqk(c}sb9B_
z#sTV<JcUZ7!`yZIOAB*+(q+hpEfCDb0!>e#A09w5MEoO@lbimrtq2YbgpVGwU8)`E
z>G?I|j7WnCOqD5EmYsy!`WsIX{n2pgV4m_*vXL%!vzap_r+52F=N5AYiq^Gh);--a
zWzLs>cYt#KF2H{4-4PHFtaka5g_CTGz}Lh4>qO)uVg}1KZxP(noA1O6md9nu3tgS4
zE3d!?3U#9>OY1(06+8%t4vMF8WP)Duii(e~FMJ+aGBOOz%>P<=_Fz<Ul9WAPpLt>>
zh`DSsA|lWVM}=Wk0A~mYMEO$VSpazFf`Wp}%l5CpQmmozVa@BUw6rt;G626zZ+ClO
zvH)V~a%h`wp|<=Z39gC(UqVAw!B(m<ruYzXBFVF4gDmhZ4Gj!_t<%uc1IyOQrZs^)
z7qDFUs`~i6&NFCD3CQq15d^%2k0#{<>^7ti{jSe|mAc4sJ|Hj{AbejUuDYNuXRGdu
zgF(;9s*hF4dJb(8y+_v9Bc%+%A7g8W*#^aizK0QpEdEV@Q<t&Vkgj~ii8Vu3AlYz>
z^tKo1#Rl5!@EJ7O-L&A9F@ZJ|0C(8k{Uc`YK)Tc@6O)}iGjPa-&YjVgy|LTos&l@k
z)d0dR44;i(Ue>grE!{df*>^d|!^SIevVW8@GRl{e7=8`MuK@UyC}Q%b8;%;;z4R#l
z2P%_xRjHCgM=Nu*Z$_Eb8%5#MA`>QO@X=^Iz5=*p7Ci1zYnw8F!v!1A8^r7Vc)cr?
zT<h=0y|@H@T`P&)Emi8o!dl|tne(EV85yyFu6Mv27_4RhjFHV8!deZtm95_OE~=*<
zA;EtSZNK~r+J4TF1ug{#TuQT9AC`$|WjvmmRwo7)D?X#V+G^?FFg=Y<l1+t9TjTaD
zHJP|Lz{G#`Vq_#OE+Zr(1E#t7Wq8cvr6MsO6PM+Xrx1_%kCfo&O<f1-@!3CjzXtat
zIsV-=3i_vMGys+;2l#>fn3)59X#&fy7Y6^FuE5yz$MFH+CxV281Xzl!v9hpO(9n50
zJ+S_Ri2oQJbx%lG@A9nz=<ONQnD-kbnqZPRyLcDGf$a1fzQqfc`Pl@yVtuOf=nx)`
z4xLSiI}z$(Ta!!~N6`7ywf4I|ITJr-=3xks{!L2T$AC_;gfiq4_=r~)1Gdz?io4Bj
zChG{X>rNVq&XEQrPmv?|!85P*F#&T%VV9gE?}PJ<qDS5fkE`e?`<*cmRgW5Be_^9|
z?(o(VDfE53Y#>m1BrzZx3Ew7SleJo24U}@)lO}~9`UbWBuPbil-~JwJjbW2^`fHD=
z=GfAh4L>)KrBPz=5@b~KJ!c5vuPh=}IA9Ob)RAVBj9=@$sHAkN-}g>@{8eu)TDj1J
zZ<%M~E?k0c;AYOqASGB%Im@=$XNa?pHh)v%pbHQffyOBs48;on$N!eFCc500Ix?JK
z(s6jK^UD=>M(N{iN*8<cLPQm?y&#L|LhKk0onKs5F-YMV3Xy)F{dw?Z_aXkyMCLn{
z#X%@`EgNf{Ls*%QU~sIIqHotJ#3!RGBG&?W$U*f4+P;{wjVkMrm*htEHL^nNe9
z+MzJ|@)qB!kyoEnO<&@Usf<oKY_cyhUhUnVkM?W@J-#U1t)_hs$$II1D1E(jP5Ld<
zVJc$0v-eY26y(c4R3F9=16B_H;@=gTA0ib>!eXAaZ1`@H>0IYOz(-KvH1f7&>QJ%%
zIS+qh4!Al2i?PCLP61(VEi(hq-tk8K^UccOf1%V(CwZ8q^Ld@@bPNnt#C*aM+L}%{
zfk{4d{koi*wbYwzY32l+<Tn*VlVs}T<2O&@pL`3J<r+S$@86v_-HcX#OJk16H(TNN
zv8rJ8^YGPq^Y{U{W^5KHpuKxmF0PY8N9gx(RqLpPy|Ubfmau8g)oH=9i|x-3p9*R$
zTdZ8Mp8{6*i#y`l7C%uyd!5C~Ry$nZ$AHl8CwpwTti1jDSV!&ubfw9-LE^=O>t7A;
zXBU+7+m}=7c&{q-d?+{#7Ad7G`P)IfEmurutuhUG=seLbAFSH(^dgz{d2nxq(D`M5
zq!$|X*S?+PJ0lN;SCKaQQ?FXVCckOHqZ~np)Z)EzEaSm*rM~2jV2OtFVE)?b^dE3L
z=vw(Z2*v2V()Nyj)tYCP_SF?dpPhtd6>B;*HHU(5Zj;VY+GJ@B*)~}V_gX~N$NJ8+
zSl0&)%sWvdg0>aI%iurv%=@05_K<NcV`4@Vs{$cCU6;LjNw3dis?R&6LCWG}Ywa(_
z?^U&3J~7d;p|Y`|V&nPWF8qs<?tuT_Dd}%4>{!R(@H$qut1j#*!Xb<Wm%A?Gd;EMf
zLKzOjUn?lMN|VBG;Mlqb6)^9->srhF|DWOU*J(-4ZE7JfM{pi>+Sz*ZS3lbE0f(1B
zuU#o0Os-22Vs?JToQ4&L8Z@ot$Z^wTl)ZKL{YJm+#w*_`#Zxk{+Dwe^+c@=2&=>bh
zZ|_mxMKj_@J5PvPEeIufSHE+9TjfK|uCIDjDYcI6Su(-$3$QkkIgY%L@20REXJoBs
z$PC^PKu?t^Oj|T6L;uWZbu0;&zUXRp+-31Rb|xalU(<FWs`l!fD16+FcHSeAc?S_N
z?l8+|-{0`UGG~QclqT0J-v+3CTAU`t*K<3MIov}V?<fWu_U#=5A5F6--y*mK_T_Gg
z<2*XGnxzFLAfC^!YjrH#wZ1mb?}`1-N$#_a!|5K*|B&2|%vC|*yi_(vSsPyZF$zwR
zmSGTO2Uj8F!{p*4J}Ru`*U>Ll_x<Ch-!S3){!fUtS=jCdP&C}!abaNXRQ_sAXhwW_
zk^@>GBkkPd`*kYR*rg{W;a@5?)|$t%##&w$D(aN4R<PqYy~ZgWr--g{<50EihZ=f=
zW`!cYAwnAW(lluKe+0H)FXvm2qBO7SfBjO~a$6;`l%zuVPr(z3AZg@@Vp~?G_>`A0
zLiq{*xFLUdSOQ=j0P^+!g{F?4f6rP;bHMYNJcX*@rf%r6jKtj;k^=y5;?}dQ9fGT$
z9=3`k|2WkHxCM}%nJYo-9LfG~gtK?f<g27gX;NAmK;z`*<<0at#>^m!&Cro0vDbuX
zz3psLIkwnHnp6HhG<1#$F3Aje`n8pIp75tBwmqK+2L~U2VR5mfq-61~>EmaTbPG+#
z!&CP{7hI7sp5jk6=M2A8c7;6_WpnqKIQ}o7b_e<YXHXlE3SMo`+u?Gbp)fwKVxpZN
zN~dxR5*5V4%4sMl_*j1wiWZgz-WExSh-oaPntYZl9UfEtjI6Qcuz&W~(=PE;xw&m)
z|3)oyKkJGAeE+IlZ_<oEI~Yj%v$8q?EUh~n9oi3qkIHLx_0ZIu%TvWV^h}DliRv5z
z*_1>UjfG|*%XFA!%Q_W5&BEn<-+yN{p%5t(RnAG)9yv6}B(11wWNLbLa8OcK2E=}c
zPY-XkDg^rtw?fUVB@j-*lH{PB&Ma0*jei1VKw7A-wi}6~?6dM4<!k#f#tG-Y&5S&8
zvoy@V{Tk9l=!KnGZ`LxC$;!{Mk!z-*rW!xur(0(_4n(s+#Jzj6R0p-fY}V#G>`*$V
zBs;!i4ZM}lhxd3zS00t?@-|{(R}>uJP)Sm4cVwA%Wz=UUO<veHC{sMk<Z(I@G)|p{
z;x_p9+-pT%@9@5kWebr_6a$P@Q)A=V)5B*?>o67L14E`3?;gs`HL_2o`!=0>=N0^O
z$2E5D)a>Y$<NUgZ7VN=R6;xloplm#l@;Lmt+#U56%2o%zgzy&Se0Z%0*BumChj{HZ
zH+_)!4Js@9C#MJEY9Mw0Zu`sKfPrRT-Y>iMQrmR-Z(;u2NsT)CFX%zNtlh>eCr<D~
zz9FW{rNk#YI&?u-yP2O#Lu6-fHs<GkjP@^{n-V@fE;^Ks!s}eQ3eehX@Re<?P)Or}
zo*J>Q9{CSODLjLdWRj)Dj^}!k5wof>>F%{vwPgl=GI^;*;gUk$_OroSH-tvgNp5d#
z)c|gOv`R(Y&h_V`ozxJ}G!y-vaZ4H<uPNu9i^xPwv&Y6-nydT*lc+kwmoGnbbVve)
zDJdyOe*b=b^b)QJ@-`|o+*1xL*0R=en6!ICB@&we(e3bPFdZl4CnX~$F3-<rbAuaW
zlM;mAkDj`|VHB<56Gxv+DG&O}et&3}9>Do&FC{419ULDxznKnmuL=_?GB`9uM?>?R
z-?1&U9<w8m5Jq&L)o#$;S&bLtcrP(`e8ntxcL7?^E5n!}Gf~8tQ@;O{@A6w)7nZbW
zpAX;Kt_u4z6BxaRdoWS?A9p=eRDe|{)&2$;KX?jdSb^;owz}@gSlJ^h4m0^!oonn1
z&g}<aCvW5fAo_r6w|jK-ySMiZb-12}&nMskT_x@6)L4ANRzsUb;zLt*r4&{DYwGIi
znwpA>i-W5IosZ{lpcIV9^#{M}c({2*_v41z#4#rcDuT?fA)3a7>k*p9iOP0LY0`tI
z3=9mJnVG4nsrmW&Y89F{OfE~;U&CkF+Y?pKOfB+u+oyXYm|AIQ-T331fzcZ0a@DQ+
z^xo?0v$E3I4oQMRgM9>D=|3ql1x0*z_68SNwiB`m_4V}}b9zejBkMrtbP04UZK{8g
z;V#Aj0}2vc=ax614_+V6Zj}QpQAmnB7;qmkuXLdsm=4hn^|mvA6<N)SO@nL(r3D3v
z0m2po!>K<aCeZcn14S=$RMMzEmPEWkX&YN;6p5&gfPmx!{fQ@g_SHcnbb++N__0H$
z_Tu7*<<Pk~WnSLasVR9%%EGR7qp?O&!n@9#01w?dX3dl~z3ispZR9o6JOtm@><Z2-
zdrE_}v@}kD7)$@h$8pm)J^<2+&xky=YDG3XrGy=G{ob?vJ$;~$++;ouxF=`YoP;D3
zNa4A`9&T;~L_~5K5Ia$q*5zAxZo+TJt6#%e#;IP6B*sC^!21Pe82}y9;2Z&lg+Lqp
zT6A`w{wxN{@H)*eO5E!ux%7~7ZdEWjGcsQ?O`8ficcW-x8(sWJL~K?Yi#pU!D>5i(
zaD3bv@Q3rQ<2nKN`OPeZwe4y4C$`*rVv=6(y1OA47g=>ZRoo(BGg|Ah$NCFFhg69c
zns4{Jps)V6E>w>&!H7|uUTw=nj=)kSOP}OAks;ZndFSo$`$MV#tY|<g#Z?pcV+-A@
zm*T;BIRf43Xfjhg&N=ykCkkNW$j!~2laz#9O<1`oDLl%99h_sL^NBYamZzU;*dmXQ
z0^<j?kFQA=j10Io;^Ic$!M?{N_9{WV-vG`MoNlvbD|L><X1H);D_mX9TBklLD<bL-
zQsU=VI?l21^LT=UwxG3k7P_}mFML&kv>$zvX`_Ay@^w}^I?LLfZ_52XD$5YCS|65?
z@pGN);*E<}$Kd#)8`Nmgv`Emm$0RFZQD!G`zA81s+n_^wBG+%RCmX5<j<nT&Vsk!H
zM%%+w-AqT?^n^gbs>qHGaX%sY%u8o0f2z6;GmoMT<D9%UK`jNvU3cf@gA)_7imF6E
z;}q|B!*^^sR#%SG;#{Tpf;arqG_Ly@@cC+ddDglXKwe&6fq{WLUO#<h0i6rLt@B=f
z?McsylVHb0neS$Pd4F5m<Lo{VhHGrl09#O0)XQa@Z;PlR#9PueRHxtCx?==XeeXd-
z_<BirweF$0r0fsvJf>~+@$w>L>z(<S_j#yse$*jk&ksU1%K<|TLV)rF3?-wN`FjO5
zwd9zXp96-#yw%as5m;T5%tefYQNFqeiM3aDwrxLfYZQATO?*1Ht!OkF!ieLA2Q+lR
zI`Lcnw3Cw)m+d-0`ag(Ve>G$)he#8D`AAg?_pMD7kzRgY<ApIdNk?4Vf@fCfEcv8?
z&<nR)#l{QMj=sL3p{IBga7P9F>VUKzP^-_*0z?pxCOFY7zUP!!n3u%#pA@5REQR{N
zllTJC1Z{tz3g~m$ZYwHx1^cw;Kh=nOdOrRB{d**8=%721Q&6z8ql2VV#?{FwK#qoG
zqqmcvp2o3)AUG$@3`=c!jUmX#*(othYU!>~cEoaE1kv{nGB>Dc-ON2p*&|zSD|eQH
z==&;E&iNOZx|?(wfVp918lbCBPfr7i|7X_9+w{h<_@WiYKlz?VZz0;d;Y^j5^aptv
z_6^t9PJprmc;CXp!rnVo*VM?2H52lVC;4jWQ~FuAX`SdChH8byyd8p|prFiGXsPMw
zj1LUJn;i{=<72;0VHtJGuhk(mxarM!iW?TuM7e9DCtK4|`A`uJ<?KM6>D^u9+yRv~
zEbKR+5Z*sLY=r<Nj&~2D(wDk|11q>R7`v7%2CBmBH*(P>8cie}!wzCkiivTqJ|Xs#
zHxW;1#%9%B-89^GPd|q|miuIyoq~v;dm1>{>s>J%#LbpNAzGc8za5px*g>HMd1i(t
z1l#p-*^oh&O5;I*b_aYgwp39wRQ&z@i)D^G^{9EkJ-!uaGn}=NWx=gr__%MNu7&uD
zkSomu{f4{qjVkSy+H)OvL%+SRoC5qgh6F1T19itLi$1Z)+C;%~<+p(01yF(LLw;P)
z8949SLY<IA%KqIw4R-elV>cr8HZy9}^emb75`c=~Y{br;jK2TFlHOWUf+Xk)SOEhD
zzCesSHYW0R5C0w$7H?TxZ*rrAX*`HgI&3t(YvZYrmaJ@S12p?fKI24H2kWt_DlblJ
zesR5{q|PlTw7L{AX@n1h^!Zn3bD7<_&x|0a<3|g%tkEAr6G9h`mvzf-1Bqki78Y3V
z-<Oq_&p{wJcX!1_R2ls5>oxCcGfVePT%J154{L>I>`v{tdz>8i(OH`a5&VDEeFq?6
z0CmpK%kwz@@($aO!wcf?e6*jHcZe%1D!9b&`ssWeh?CyE?dkEoJxK+qVnf62oCk~p
zyB&Dm&W)HWS3$;>eC9`-ff~)o>T9^HZ?kQzs{s~sr_^Nhc&MZFKwU;v6|WOBzZmMj
z{-vaWo`JTb0Jev!R`#3Qyj=25`VZYglLSurI)?ZJ+%1AH9>A0b!^^c3fi4iEE1`-D
zK|}>n6Ha{Y;RxwmvR}T}Lk>)9C$j|AwT#Uh?8E6a41gy%JOo^m-#7c+R-|DYad`Cw
zzZtWFb?H5iGUXtUuH46Hp*xHO7I)MZ1zfUbxwr-hV9D(BcYA=xaj)CZ$i(F3=HXro
zJ=s~wvQUa8OGSWtuWY`V>17c!u6B7moeLNrP(X$@F*5^9Z_pn-&&|JU^Won0B(z(C
z#XHuA%sI4QKZN4Cq;jWmsG-JVGJX#B`~%3Q*#e#}Kz+`x@MWj)BQ)V`wq|M@S-{`|
zv3Ey7IUoKMYNUO7ejnc)M+&TI7uK|dk)-^N7N5!yARbyr@KVA32iS@aa$wPpYJ%69
zAkcSfG%iuESB`Oi7l=h|<1$c(1OrAD-@iB1*FX9Qo~c?&EmgM8?$Ch_U7)VP0aW+%
z5e4N4T+=}PZ-0Bc7O+?)=lc026X)ShKvaognJD}CGnb|zaXO^>1NDHa-Y^h+WRH8e
z`(@K#B2iPD0Rp<deoklSYXH>EvzaA1ip=1mn9Uvf_zptdgez_%3<r*!$nH=4ytTJy
zYGfoODf!{z;jMZF<x58Y#GMv}Hzw2z>0R=Eb#FIZ1I1?}vJHvE5`oeAv1e@?zPT$Q
z2Oyge0&HU5n5hsFrgdgV*Fkd1g7MxIXE(7*w28G?o24Jb8pPG6H8#2c!Go;qM8t=_
zdiC5iPeQMie61KZE|1-yl!*70EE*Mu(|F&k&nMqyj@_&lz!}$8hX9f=5Eif2lU!6r
zhvpPpSf-C|)F~n`jwlXlJ&wSwDUim&sS+g$h0#g+^=$z;B9bM|^%iD%(0D7o`5%qd
zmoLZD6+u;@8wM_$Jqf2(eYa4LUviV4;&3rR>+r>@QC`76t59Y+_^}ig1RMiaD*y{-
zZAd9X!ws|$QtfP;Rp_l%P*!_jhS9s1Imgm0zuGW7%=T2@-#@uUNed?TU<}an|NVU?
zsai^Y>4rk2&k`3pui(;uHyPOjN|Dx*EUil=k>Jb^e(2;iO0{Q6y%wyL<)9@|mTw4u
zf482H7)AEJ;(*LrL6%v<Nkd2z{S^*uAhfQET^k`@XX|r6A?`UZ)N)Q?0)A1@9tjhw
zCgj~hD&Eb1Jc}0_XoE2AA6(|1Z|9t!V|!aMwu$-<Iug3$XT4G~m;81`y<2ihD9}kA
zDw-z5l;0U0t>x>uVm7vI*t%C!wVb$JcDYt{gZF4X*kZ4;9&MdOQ6_Vg-_*ByL2GH-
zV%+PePySm)&H;|Z=zMMrD^c-sTu|=${+Fq;{KHm5u80<Tytl6I<J_p~+n2Jds;rgg
z{A4!;{!`m3kahdL#_HF0N=$>ZV}oXW<+k^>4M{oHiY|x1w;bOfoW5V5G%uPoeuzeR
z$WC*-q-^OrjQignkwvSbWB9Op{)HC3qr;Cz%tuG&0oRJ{z(+7(Ik2y>iPNX@i{~BV
zj<t%Vs}N<}Qa|m8c&z;B@^7-!b?4Ql_rW=P%O2xv8t&^QJ6PxETEn_-4?1tut=~>X
zM}09Dz0SfXWQ3>VA~F)4pHX@Y+LzL(pAd8_;UEwHfa&*Kj-b}O8t$M-t&ccO&x-7J
zRw{6^24k=4Pxu~VxQuVt^<xnViIj)C0ml-5u^39d+HU*P&(tR2jQC$E<In;`2jY*`
zU}v_$<~ZDbE%fw#YcG})XZwd@^1a!pw{f~Xe20z5*nsEd#jom&_RBQ9)heNxu|I#X
z;?{4w#oP<Th?L3fe-Ae=*W;*LyV>n_mWl1ZPeV$^e$NqTk1&q8+CE-f$!AHQR$im;
zP*146gqq1&zLY%$llzL@fDzW&?sGt`5?W0-cYW!X)OuQXrc+TX_k*7nC4)2Tx?Qj0
z%#fpc8Oq{21;@klOcUuoOG(`)P}KV9Yr>Jzn8%zh<$BijdQBoc#cWrpFvDkD1H4A#
z37b!HU}rfZ_j=FMqP9Sx)|lDm8xsFjS0r>&L!xm|O59gEZ`A0M^LKGUmhozF(OCEg
zp2T`NWmDK{X<U3MOqp{Ma9|lKW1|M4ZRj|tV{;lE!cd}7ms%9pt;qcc#6e({y1@xK
z?WC+;l$WM!)OF(RvWArVtTj`pqk|@pxs9BMX>aH%Q6<~HUYRC9Wnn+5lEKQq?FPjo
zS1%W^#h&+OxeYIZnD38Lb{@5C+RKNW$z$f|N4ZF9_*!L5Xs;&4m4j*t=QLlhUpbyo
zvn=$gFPh{d=v>y(9j0_71}(W^22rY)jo*}W9>RcQIa4sPT=1TV;{|Wp^AJMwjPNz8
zDhg}GMD#x}4NAG6;x2+71wS-~OKo&b%E)=)5XS2+g`F=Y1<5!pa5SL?J>ziL5;F7t
z=l0;r#SrTMAZXXL9S|mNEV`B{PhMqz-(~s?iJ&}gB5e>bpqdbVv)27H6hYmZ6;?If
z=_h0uzNCfwLBU+aSlVhPJ({68*O=4$%)xyZ9hAmPw7_7JTORxq#s~&DKtZ$12bqq}
zL<`v2zyXMb<8_h@9(=2pfxNuGzn`j&9vT%@1+M{5RtC20obxJ$u8U6o9-IJ@gPsv;
z`fgYcB3R%?!?G$F8%t?>2TW>FSsD8a3rfxwr>7Q&`$bx-UKTq%U0+w4CSRVZ+CJ_G
z+bG1w1Oa^zuw-N##(-UY%EzA&s(2tfh4X$f!z7gm3z2rwcx5>t6T%rABT(1K?jMqK
z(GdGYODn1HjeKNCTvphycJwm(*T-SNQt3G?%&)h11nBEvwub>miQPI23nvB!#(GJ~
zu;g|4iexqh8WlfF4YpraVS$~aqnwMqm_c}|2H;J+6@=c7iZ2UX?8D;3gt(+6{O)dz
zy*<!R6Aw83>u<xTrW@<m2zcF3ZA?~`*2Ef4zyldLh_)2)_pb#)W#Fa|1>u2W>|IC4
z@zD{@9*(?{Qg}$1CK*3aCud$VfrX*M(f<eChLOkU{%4;Im>lAtK&u4M)x`q?2KLB+
z`RM?o`p-rhFuO42Koc1Z%x_rre;{v|1i0#dcGQ5OLHu8M0v!0WQKjNdGO?txI{C#%
zzq#gNajfa=!ZQS|E5J#%aNexRW^P!^R&$VfX#i20W@_Pp!sE#=EdbM9sQP^_k%^^~
zvzXyrzBpF&-=OFY8TG*m<&@>;xqG1hWr!)IJgd!ImNIw|Mgm2jBMepz*R1IZ6uE9q
zP4Oo`f2i_)aBt4ph1^_q(X_J>JBM^DeG!u!v2pdJzIoqxSiqzf6OP${-P1;Y1#fm*
zw8ITLdxv3Z9=uk4Gay6v#D_KcBh6k1Y1!GeEbAnL17r6WVH}B7MlW`}u8LPMu&6jc
zTl%9@Nz#Z6A$;|K0V^PooUcVX+psK(w;eMaSSM{sF-kJjNuGc11u@HeZ?m8u%i$Sx
z7KN9F-V4CM|AcNDwfo4ahfi(gEY{Vsi7mBK-29AbH$t{~E*Hz;=?@X#=&H0-tl(_?
z_XDE9lwqt<*(zAY#W)#0oMRBxci|oH*|BpdsD5{{oYD}T$eiSR2>QeZs>MWRHu#G4
zR-u@vteJes@q))?wIXPgm$LOz8^Z=TqoUPC;%o9wV3d^}${ADwTc4vAQSGGm+wWG>
zp)0Qafu~!tyq5+uYYa>3Wza>|<wRwYoz~{<<<{Oqx8={*uZb8AtMk|f4a8dZHEbFM
z`VG0=e{_{pFEl=6Ix7&XN3z4JQaE0aX+!#NJLB552x^>Vg>WO+Ku9HnmK4yah0Cne
zo<U;ZRGlzjXT)6E!<>la@!ml3+1g(a30!@S%c7f_z)gf`yvV1*Trs$!LiB1)aY%aq
zT!!3+1y8^(F6AD2JpGY3d$Lt%YRj?x?P<fTIMn@QA>!Up%p`R{8OdoE9FUMPd0=s+
zN>hn#M#k@qzLA2TEd{0-{T^B2j!p7ZnJS~H7T7h}E@frpe%4mLI^*$~uB*Tghl>Jo
z0DmAgl*}cELXd!aZ;mA|W$$41!vK4nzvkwL;YpwLuO6>)_pCG>-IcJ%=u9nXQx39G
z`S>?tj9i_i7Z#e~Ibr=)IB!5tDR{{VTG43~^{OfkJ(KUc$+~`c#X?)0za%I8_V}jy
z$#JVI1b<F5HZ?02nj0IIC@yV^i)sCIXCWQ&f;y%9z}jH?EAQ>lW>u3~yR`H;7?f+O
z)u}4^#$Bzm3SuXdSjchOu>{8x7-%_tat1fZshqHohs*=d<4m>Cs{vQ`rt{&ushI3Y
z@DbkU**Ccy4%=6UPqtsnA+IGcM&QpcMsOVM?N&L?UrO=h8jQvD4+I6jyp6ND{UnRW
zoE%UOqe@|xqiM^ado@hP8dVsodWFDV>7#Y3@-t_{1<Y3pAraUol<Z@)-aT=)pjg;^
zMp5CRrC&;Bwf6V?*00*Qm1cRLRJ$KdDu{-HAvgAvd!}ZOXVK)GeGFcnSbver>^e&+
zo-A#s^X!a%Nc({Q3^WwcP`!ZL)jIfwL}faF^y!{)j(E0O(Kgb?z0r7s&(xWr#oyoT
ztiGsyuKrRLLDyOfbHN)dPPT1X<%acv5DS;&hPwi%V<Ml*&SWgM0&#W2r?02*TONVh
z$z60y8GqY`kDUs#8YY>Ghh*Ci(QoFaMyiI|+bdOQ+v(PoI?EIz%Wa1tyFR`aJ|Gbh
zK(INPJ>i}3FV)~wUJt&@Xw6pGz=r$$qKMiRutHb|QX(#Am<;~oDo1(}#lO-@SA+Ew
zD4tgNTD2VrMrv3$CJ~zLo{^6ztYDtMadE=^>~*iwyCu5DL#URGkN*ds0O$Qxt>4I}
z&S8yCTg~7jJzlMIp6ofwqIy}cZ))xiPMlXBudK=4?GxFm$27i2t{V@Ri6JvljG(OV
zla5uDes!Kws?}y9iV4a4b!8Kbpx1n+TE>d!#B07EezIpZf5Es*ol=g6*S5AVwyQp2
zxJ{jR7wrz&J_v>toJnsYD&o*S5P6MsFqlqQ3_Oq#*@sv-HoNahfB#qlUm@JyNDuQu
z_!+*M@B<vg-GY%NT9|1a%I<sdXD3^AM<GG{59GqUENr5%(w&Hu;H49JBQWE49Uc3)
zWT46BO^A#@we)(8RoP5t96`82{e0Y>3oSPX&&LaA5>iR(+|rAV%^{EgrNLlPM)4MT
zH0tI{K0)CGH8m-AQK?M3y8tg8N4HQ1Ms!M^p{I?e!y`YINV~Y(w-xrpHnP0NageV`
zRXJAoDY2)xt%0lQDUAyxejQsSSYOr74a;nR%V7nfY~1Vl6ZQ1g=6Jnnh$&<~Q;7}r
zK0;bq#FRs@&3Suf=L<2UcP;uVW8w8jbn|<rzMHmB#c2pa9r<G1{qywV0=wU%kBcr+
zg(U1|x4%Xxo}Hjx)rO-8%aS60>C4@t{SXel?`@aW69o+jml3PcNj14vlmxEmVC0{}
zOV4hY6*XINULXDqT$}$^$Q^Xk<Y?rIo{X65i_7Fylb?4V8`NQ8I}v_!R7PoE)%<uo
zZaXW-P$<maVb_AAt4;_spuM_=ggMFyfxn?kavnDjr_XuEwWD!+x*$S|GZu4P9%PPx
zroPCm6J9lnH7dQtq2wC2meMcY6!@1_L2?EUG7U9JRs0%L4;woKwogZrdXF=Nst=y_
zACBu@(MD1{K8DhDr+gWas5$(PK+GV3;PFusU!Q}uhysLKT;$5o5b^zR_QikC^<8;T
zj)w7e_DL>V)!q9O8YeF0Ra}Kg@u>@9iQk=_reEt|%UU|39Bto6jeNXN`gw+w8MC<m
zlw8@N5tMJJ<oE+=wILzKd%3QvtU@4Z6)BfMW$lup=>~1m;NNO|PcFe*mnZ7=Rc2I3
z9F&H}J8aC9(#Gqu6ZgljUcm=ggsw|M&X<ylG`hfxPMPm09UP%4YC67Z7Fh$>XkiMg
z%R<j_te&>f3Kh|u8}lJlpBJ(Mgw0yk#{x#%=&4Pv;?`*nUf`^Y1QB0%e^pj%s0;Xs
z-b}2B^EH=<(Q*-U{iiDRjsZGx@tHc2CxEg2cF~&u=&tN9jo6`MXbtD501VEO%cM6O
zuYWLFb-5y)JfU}gdz1ANxA-1yUOJ9S7^J(RE-H)G&7RE6lAvRQ)|fZ@)+ivco6%Ky
zoq(3+vlk5OE^S|b!2uDW#?3XmCX|gy5DCkk_EDCe!b{;BYUns|t50M}r>Z6D)D|<!
zi!6iFrSqVMFxIkH!(K`NRG$OA>RJ`5fjwm0z=ETXmT26(2!#Le-Jn0Ho%eYO-G7~k
z{iC1@<dr^tkP-ylpYjd+Vm+s^)vl)LL5ZH0Xatatf8f64D%sXfB4I+0n@?u*w#^*v
zW*c5W-#Kp{oVBAm0-C+4@K{t{P3`?mnymi@?yDhZdaX#(dFOwA_`2bOay_6E>AUYp
z+cbkzm)a%tie=KNzVAaKGEgJX`dm445FM2NQm-D_H$At#@2E?<|HLC4Fj|-Vu4c4f
zX3g&0axRIKr%}viYxc78!4}EFND&ihm@jF&1pi#>w`)Qct7401cV~?Tx%;6&zcIn{
zfZXxIDXg-$l)+Y~F})15_an0$Jsx=dCE|}@N}Q|pjf4@4FBy~eJZ)^^{^j9(hE!CY
z#_KFt4~{A5f^3aWVy#Ogq%)N$*;7n-j4^!abNiT9hwWOoZ#(0VBTa*24g{~va~>`3
zM>9)sl&x>+W-Zgm@653bi@j_YSBjn_67nD`4_Jo!GE%+#Cf3)ouSRu*HSixh<IwPl
zfH(z4J<^Mu9k<mqEkNbL9F4H-gAazWRP~mj|4-X@9Fw^Tt6;cGIx72x_mm4|l{{J$
zmS?3Hz>BxiV)@Iq#Er1_v+CQUw<BgxV`fR}V6MttTQYwyzOFih)3(e$j&pX9IF+}O
zNqiMvvgF&esL)WKVVrMuk7LL3;nVv@7@5@wax2FnE1l}&`GgH~R18mki$AmbKc8sl
zou~#D@A1W@d29H&1UTFUZAM03yi@&N|6o4G9^Q{uPk0M4AV0Je)j8QyAmdLEXT1>$
zQS)4N5#)&e5)!6hH%l8V<+;lpG)3dp9iF)MXkw#17`h8)`a`j6lY~xCM&f|+N4?<9
z`J1({B9jzZ8lKzZm3L2>dp761=8rGSZ?&vbS~9=)Bfyr6kpe@MiKJ!)uWSwc4I>^P
zDhRJf>`&w1CaY-%WxjE|F28t($gFiKznXsV33~0@`?|K}>+J^A<f2{amp64Iv-3_~
zDP4%JkTeL>eMH>GA@8Hs{th(liVt7i|0Wn}W9t0&A}BJ$*rsnQ!mYmjetwcV)FNrA
zzy!UP@ET$wu)i*X=H~-6J{gG3LgzrIcO2@{!PT0RJ$S-(P5fmStIRV2v0YJWcRkAR
z)`?u5dO@ecA4>_~ucLI@n)U3zLC?`9R^=P&wm*T3)p@PET74KUd<`42g|ss9Ha)ob
z5w~O0GYvc`T5#3TFPMPPS8r1lw<+$pxKiUqjM9gt7cidPlp%inH7=n#dm#tc73yv}
zP_=oT8@@!-+@ofBuSeDcY95RK1fJO6KHyxhLSNzc4xnG-`=ZDfsUK_5^RHqzP0!8K
zJ02ZG5IPkxmOV0RvRRX2|Emr)L_55hRc+vx?$=<e?I~0%F=+++s!$fQPXb^pA|~Mn
zLD3b!nSr5&aD{K$c4;qMm&-Mb`E^;}Zf~$dpi}$OP|zkVNKPlfzDw^OU$t+c!->y%
zl_ha~bM=Novwhan^JDri2n_T+Fxq$bhv(5cx|uL<f{4_JeuGG&rS~n?pz{JQN%msR
z(1uX=cJFC~Xi&fKX#b^FEyc^@h3c_P<h%B8Kdo9NvEi8jL7}-AnPS1O&Z*O(fg6bR
z5OX|&!zyxO`dI4*rin1^B)OVlTFXqq!SfH?Lv<9r##y)iwjsd05n5Nu{NbTW!Kv&0
zX(46H$oIZp>$e%2S8jfa36p@ggL=sA!Wn|GO|Lw8l-Rc6dbkn4H<`hcCAh(CgL9sj
zAd>yU*GI1S4w+e>qnjJ1Tx^hiZKJuHJ}*&hz4Vg-(+acW$KYDc_+rALMss>IsD@rg
zJ>o(2hrq_^VcrC)^K_x-=xBWo^|Y&oGcIPrx5{r1oP{{~t&(0J5JBtyleLwS7{9OO
zRfY#>Ci>b$#XTny7Xt#l-6iDrJXkOWKsd^0V1dmwJ{esQ@!mAab=lc&>|ULGff+~M
zj;PKPYlE!1*iSoM&C8cl7g9S9jbQOwLP7eQ*~3g67G=zW_})D45rVfnV&}EhKIiEV
zQeXo;GW^8Ogau0XhJ&Ttz<?YezaNAJe%%OqUZtLzV)*B=_)E!Z7<cW<`e1{svn?Ih
zVzI?r=hlj;pLBI*Kkj4e8MFLpTPS4bh}J=i3*v!=l?m0>;q-QDhll-y>li}Wm|s~X
zblhUyq^q+H`7Uh_Ps+K8eZQN|nZ$?4sxK~ZezIiTcm1>!@g3#N)>!14ZcOmCp-GVS
z_+c0Rddvc-ePNL>X(v@gHI4P9dn0UU##N|RIx>odWsDrT8>6#aTg9Ht8gB2`@?o*z
zWY%uq+P&CRyr2~2jTR5JAz{)RZ8tJ0`fJ^0wC#jHMqG0|ncG=~iYWbkDN79y`{<LF
zwx29fmu(=9cvwgDwqa_RCyDaxi2{tJ^Rq@d&cM+Bpk`N_WCH=`?$@7oB1L@}teLm!
zkr_R^**cVl>5>)Oue8fvo1VnJS`0f4nflqOFE`{=qv*n^cLRc5j6>XZ$s`I|E$kgf
z_-K%bW9Z6Tu#A#3l-PRW`3X}7@Ezvpm^#*;$*0G|S`SmQlNG*K7Td=$xl3ELmVR-G
zcXYY=veE=dkM*eR6HnY)zwNG{L6!>rxT7!R+D(eu5VOcmr|&D?=&E(YQLxCz`&QDL
z@X2y=NTCE}$efAH*G~c2jQbai1{>QO7Htd$O)<4v@z)J=_i!DfHPP~CHu^Hi6)l?V
z!ypD-P+gOlk$F<~gqsnR@6wJ>;p?6UGW!aBUayq32>8Xh-A<#p|E5dF$5s6A-J<$r
zPKH`=kequ83)_~ltwzqFPVk>aJe4!q6x#qMSIsU%y_q}4FPsf4a{df%$cZp|r<Na0
z@f7`wv1*=f2sqv(ThcM25QPM7JRYW7Dhj+Agz)oOTG<_1FyQ?Y1e{bV-sKv`-CmTt
zmi8;z6Cm!cE*zDXFUsZOT_UXSuDxVFFh$v8%MP3yXly((_bHT%Il&GpO9q9?Ov0SA
z@&qe}C_MEHz570R2H$!#EnOa=Z;z$;Mm@6p*B6}x_&X{Vt4R94@*nJ8jJ8{gC43GI
z69h@HT^u-24QSh(5S2*U6gBq6@1)%kwNR7YQ6^{t+c_;h><@^w<5H?uU`vh`x*v9q
zl9uwUr8n)aaj2|3xjo`_QM<DH$_(3vFAWykpR;A9?Qhnd{30<sJv75fJk{fH#Y`hF
zu4UZQ8w-2XRZ&&^UbRgOo$||0xl7r0I8_Z>?m8~etreo3{Pwu7rb!{~UVpyum3=g-
zYmAra#|7$Kf`_Wou@z_s&0=M^IzCPyTzU&k!H~%Lppfbz&SRY+E-KPu;EQ;~z=>jW
zl2&x0iq+;lf8Kljd07?*{Ozm#&vho+hPT$5^|aNwmdlTPWQGiT5iv+mL0th`NBf+{
zsL-yD-_=FHJEtQMMXrk{vSOX2GZm88K4+fSzEXm>nSPGb@oPdWKNjy~2`w@CO9v=Q
z%tEaj>!;Lvmj7<GoOcu_qPIM#aDP7P5{+}1L7+ZA#XD_0NmT)#u=;!~F$+7eR@pgw
zOu)(pJdb*CSyZEzGD|*4aY$ctoz_q9+9FTwtPww>3oF)?GcV@Wl;nTrs0q*s5;&C=
zffr%2a*(z)U2}TTN-_*V=uKj$o7^uV*(WjKkk3Q?Dp2%;RoW`YLr+I{wtBGguy;>Q
zmUg=RCOp&SPmcw%$J?JyfD+-joUIl9h1<Y|pVlsktXW#Zao*q2%;r&Oq*Sy9H4K^Y
z9#c<C@zVP>2+mB-8rO{E3;4F=XmuYwi-)}Egkr<@)e0=5#_#3FGDGgYJOgZWDEdJ(
zvz=GGN^iNYwdq$N6vy!}Vze(L;5q0F{(Wry$#z<rrHL~GgRyV1a|z4KM6UNfu#fJ*
zJG?SuTo!(Zo^^*!>FeB|M~d1Lx?QK(N^>at?w?7L%R4)lvXp|@wkNYYwh3ci8=hPh
z&{ls8wqax~GYJL1Ys{-29XdVh$8mND`8`6Pshj>#W|*`b8(u7v#Fi_@#MSv_t5Hfn
zCHwrU>@4+a&awZR(|T^jT^*OmYo3ys{o|2$lT!>kaa@oXZ~E;CRz<t#I)sVlkIJ2s
z$mj8l#K)kcWxgo$OEl>v=oj7Q^4S@p?^+g9NQ^ut8e8y6$85dNdeXBkdp^4U@q~;}
z?BbPNIU_%#O+Od1?y+4$k1t1B*3u##XOYX7q>JjA7hbO)&~K!(Gdk5@npZ|>UE+#=
z!_QmKu`#pG#cY05Zi@19bzJlRHBceIXNbqn?YLm^go79gVO-=m(j+_r5#fmJP3OP6
zkM4Q8c4;x?+-!@XX>DQMGb5QGV4C3lTy~-!@J+s^oPX0%?<u`$GQT;d<Rt6ngF=Lq
z$_;1v>_(GTmUX~y2@%*Nxe+_c#yH}IGc;0i&I7JwWr%FiT#xuSK5<ahvHvI`YWF>%
z1rTtaeT}SXQ?rRkxH#{EQO-u6YA_ivQPCr0cB*T>Pre!_UB|HwZ3|R>(_Go%VKu<F
zoVaWYfOwuZ9%7<=FT|uhxZxVFC!dBml8jB0Av>w~j5@!^Cz!EP-z>=AGFH_$W+ny3
z`wl2Jmil%LAshd3GtknqLt{I5w^$Sn?>>;da)^(_xWAtwn6&V5Q%kVjQ%%`Fi#J$K
z#Aypnd8pV=4vl|R`uhpohNe!8j=Yfbv_o#7w1D+(+2wh8I6?=o^WilMAGpNeNLaz1
zRG8xzVYuS#XskB)^FzweeO!<bO}7y2ihSJPO5(m&tHKNZ84m@F96+{5$>G^e@w2Jb
zTy@mKGHg8#f{XXGued(dh$`(d)z05f#@hP|Y}Vaf0K6kCeW^2R#k<WLH!t0=kiwtA
zZ$KVCh8+fBieFDsIU@SK{#6!;`P2<c5Y}ppuW`Rd8TyHBDdtPP1d~&gjL)S}G`exg
z^I7sFIap_n04N+#VF7pW=8ksRW`iVSz#sL85tf!zZAS}Y;s7p@XcP;Ym3|z<15T#U
zwneZH0`Wgbzyn9X2{*Y;O-EN8&iy(|`N0)re{=i0X)?~Qa*3NC+(!{Jdw*>$M<{i1
zQT1-i_@fUsa2zv`QNRwODBp2z|9yfDXU!wHU$Di=Mo+VXfzfw>{r|DP0o=_0iVDjs
ZqAn#(9&-{s@O5BdBt>OKDuwm^{s$pBT$2C*

literal 0
HcmV?d00001

diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh
index bfb7f9ffe..a34624536 100755
--- a/example/ck_tile/02_layernorm2d/script/perf_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh
@@ -2,37 +2,37 @@
 # run from top of ck folder
 EXE=build/bin/tile_example_layernorm2d_fwd
 
-$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
 
-$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
-$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
index dcd40fda4..d56406b6f 100755
--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -2,30 +2,34 @@
 # call from top of CK folder
 EXE=./build/bin/tile_example_layernorm2d_fwd
 
+for fquant in "" "-fquant=1 -prec_o=int8"; do
 for pr_i in "fp16" "bf16" ; do
-$EXE -prec=$pr_i -m=99  -n=13
-$EXE -prec=$pr_i -m=17  -n=16
-$EXE -prec=$pr_i -m=1   -n=100
-$EXE -prec=$pr_i -m=4   -n=128
-$EXE -prec=$pr_i -m=80  -n=127
-$EXE -prec=$pr_i -m=22  -n=255 -stride=256
-$EXE -prec=$pr_i -m=7   -n=599
-$EXE -prec=$pr_i -m=19  -n=512
-$EXE -prec=$pr_i -m=33  -n=313 -stride=1000
-$EXE -prec=$pr_i -m=11  -n=510
-$EXE -prec=$pr_i -m=171 -n=676 -stride=818
-$EXE -prec=$pr_i -m=91  -n=636
-$EXE -prec=$pr_i -m=12  -n=768 -stride=800
-$EXE -prec=$pr_i -m=100 -n=766 -stride=812
-$EXE -prec=$pr_i -m=31  -n=1024
-$EXE -prec=$pr_i -m=64  -n=1000 -stride=1004
-$EXE -prec=$pr_i -m=8   -n=1501
-$EXE -prec=$pr_i -m=3   -n=1826
-$EXE -prec=$pr_i -m=5   -n=2040
-$EXE -prec=$pr_i -m=7   -n=2734
-$EXE -prec=$pr_i -m=1   -n=3182
-$EXE -prec=$pr_i -m=9   -n=4096
-$EXE -prec=$pr_i -m=3   -n=8192
-$EXE -prec=$pr_i -m=1   -n=10547
-$EXE -prec=$pr_i -m=3   -n=17134
+for fadd in "0" "1"; do
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=99  -n=13
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=17  -n=16
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=100
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=4   -n=128
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=80  -n=127
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=22  -n=255 -stride=256
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=599
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=19  -n=512
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=33  -n=313 -stride=1000
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=11  -n=510
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=171 -n=676 -stride=818
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=91  -n=636
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=12  -n=768 -stride=800
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=100 -n=766 -stride=812
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=31  -n=1024
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=64  -n=1000 -stride=1004
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=8   -n=1501
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=1826
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=5   -n=2040
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
+#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
+#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
+done
+done
 done
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 2c423831e..3b198502d 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -25,6 +25,7 @@
 #include "ck_tile/core/numeric/bfloat16.hpp"
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/int8.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/numeric/math.hpp"
diff --git a/include/ck_tile/core/numeric/int8.hpp b/include/ck_tile/core/numeric/int8.hpp
new file mode 100644
index 000000000..9ca3333c3
--- /dev/null
+++ b/include/ck_tile/core/numeric/int8.hpp
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/numeric/numeric.hpp"
+#include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/random.hpp"
+#include <stdint.h>
+#include <type_traits>
+
+#pragma once
+
+namespace ck_tile {
+
+// use int8_t directly for int8 arithemetic
+// here one can use ck_tile::int8_t to access original int8_t
+using int8_t = int8_t;
+
+// limits
+template <class T>
+struct numeric;
+
+template <>
+struct numeric<int8_t>
+{
+    // minimum finite value, or minimum positive normalized value for float
+    CK_TILE_HOST_DEVICE static constexpr int8_t min() { return int8_t(-128); }
+
+    // minumum finite value
+    CK_TILE_HOST_DEVICE static constexpr int8_t lowest() { return int8_t(-128); }
+
+    // maximum finite value
+    CK_TILE_HOST_DEVICE static constexpr int8_t max() { return int8_t(127); }
+
+    // difference between 1.0 and next value representable by float
+    CK_TILE_HOST_DEVICE static constexpr int8_t epsilon()
+    {
+        return 1; // not used
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr int8_t round_error()
+    {
+        return 1; // not used
+    }
+
+    // positive infinity value
+    CK_TILE_HOST_DEVICE static constexpr int8_t infinity()
+    {
+        return 1; // not used
+    }
+
+    // quiet NaN
+    CK_TILE_HOST_DEVICE static constexpr int8_t quiet_NaN()
+    {
+        return 1; // not used
+    }
+
+    // signaling NaN
+    CK_TILE_HOST_DEVICE static constexpr int8_t signaling_NaN()
+    {
+        return 1; // not used
+    }
+
+    // smallest positive subnormal value
+    CK_TILE_HOST_DEVICE static constexpr int8_t denorm_min()
+    {
+        return 1; // not used
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; }
+};
+
+#if 0
+template <typename T>
+struct numeric_traits;
+
+template <>
+struct numeric_traits<int8_t>
+{
+    static constexpr int exp            = 5;
+    static constexpr int mant           = 10;
+    static constexpr int bias           = 15;
+    static constexpr uint16_t nan_mask  = 0x7C00;
+    static constexpr uint16_t head_mask = 0xFC00;
+    static constexpr uint16_t mant_mask = 0x3FF;
+    static constexpr uint16_t exp_mask  = 0x1F;
+    static constexpr uint32_t Inf       = 0x7C00;
+    static constexpr uint32_t NegInf    = 0xFC00;
+    static constexpr uint32_t NaN       = 0x7C01;
+    static constexpr uint32_t Neg0      = 0x8000;
+    using bitwise_type                  = uint16_t;
+};
+#endif
+
+CK_TILE_HOST_DEVICE
+constexpr float int8_to_float(const int8_t& x) { return static_cast<float>(x); }
+
+CK_TILE_HOST_DEVICE
+constexpr int8_t float_to_int8(const float& x) { return static_cast<int8_t>(x); }
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp
index cb18cde70..4011e08ce 100644
--- a/include/ck_tile/core/numeric/type_convert.hpp
+++ b/include/ck_tile/core/numeric/type_convert.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
 #include "ck_tile/core/numeric/float8.hpp"
+#include "ck_tile/core/numeric/int8.hpp"
 
 namespace ck_tile {
 
@@ -60,6 +61,9 @@ CK_TILE_TYPE_CONVERT(bf16_t, bf16, float, float)
 CK_TILE_TYPE_CONVERT(fp8_t, fp8, float, float)
 CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float)
 
+CK_TILE_TYPE_CONVERT(float, float, int8_t, int8)
+CK_TILE_TYPE_CONVERT(int8_t, int8, float, float)
+
 #undef CK_TILE_TYPE_CONVERT
 #endif
 
diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp
index 9707f2990..de99be196 100644
--- a/include/ck_tile/core/tensor/null_tile_window.hpp
+++ b/include/ck_tile/core/tensor/null_tile_window.hpp
@@ -80,6 +80,13 @@ CK_TILE_DEVICE constexpr auto make_tile_window(null_tensor_view,
     return null_tile_window<remove_cvref_t<WindowLengths>>{window_lengths};
 }
 
+template <typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto make_tile_window(const null_tile_window<WindowLengths>& t,
+                                               const StaticTileDistribution&)
+{
+    return t;
+}
+
 template <typename WindowLengths>
 CK_TILE_DEVICE void
 move_tile_window(null_tile_window<WindowLengths>&,
diff --git a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
index 837f52c39..62cd26b6a 100644
--- a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
@@ -8,20 +8,44 @@
 
 namespace ck_tile {
 
+// Note: for simplicity, each functor only care about single M
+struct reference_layernorm2d_default_epilogue
+{
+    template <typename OutDataType, typename AccDataType>
+    void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
+    {
+        const int N = acc.mDesc.get_lengths()[1];
+        for(int n = 0; n < N; ++n)
+        {
+            o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
+        }
+    }
+
+    template <typename OutDataType, typename AccDataType>
+    auto operator()(int m, const HostTensor<AccDataType>& acc)
+    {
+        HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
+        operator()(m, o, acc);
+        return o;
+    }
+};
+
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
           typename ComputeDataType,
           typename YDataType,
           typename MeanDataType,
-          typename InvStdDataType>
+          typename InvStdDataType,
+          typename Epilogue = reference_layernorm2d_default_epilogue>
 void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                                const HostTensor<GammaDataType>& gamma_n,
                                const HostTensor<BetaDataType>& beta_n,
                                HostTensor<YDataType>& y_m_n,
                                HostTensor<MeanDataType>& mean_m,
                                HostTensor<InvStdDataType>& invStd_m,
-                               ComputeDataType epsilon)
+                               ComputeDataType epsilon,
+                               Epilogue epilogue_functor = {})
 {
     auto layernorm2d_fwd_func = [&](auto m) {
         const int N = x_m_n.mDesc.get_lengths()[1];
@@ -51,16 +75,19 @@ void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
         if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
             invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
 
+        HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
         for(int n = 0; n < N; ++n)
         {
             ComputeDataType x     = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
             ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
             ComputeDataType beta  = ck_tile::type_convert<ComputeDataType>(beta_n(n));
-            auto y                = (x - mean) * divisor;
-            y                     = y * gamma + beta;
+            auto a_               = (x - mean) * divisor;
+            a_                    = a_ * gamma + beta;
 
-            y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
+            acc(m, n) = a_;
         }
+
+        epilogue_functor(m, y_m_n, acc);
     };
 
     make_ParallelTensorFunctor(layernorm2d_fwd_func,
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
index eb06fea2d..fb8d7221b 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -9,4 +9,5 @@
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/common.hpp b/include/ck_tile/ops/common.hpp
index 4363ea1f5..1510f18a3 100644
--- a/include/ck_tile/ops/common.hpp
+++ b/include/ck_tile/ops/common.hpp
@@ -3,4 +3,5 @@
 
 #pragma once
 
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
similarity index 96%
rename from include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp
rename to include/ck_tile/ops/common/generic_2d_block_shape.hpp
index e4b60331e..64ad20c3b 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp
+++ b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
@@ -1,11 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
-#include "ck_tile/core.hpp"
-
 namespace ck_tile {
+
 /*
 // clang-format off
 
@@ -42,7 +41,7 @@ template <typename BlockTile_,    // block size, seq<M, N>
           typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
           index_t BlockSize_ =
               warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
-struct Layernorm2dShape
+struct Generic2dBlockShape
 {
     // block size
     static constexpr index_t Block_M = BlockTile_::at(number<0>{});
diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp
index 62ba9dc0b..cd1e43fb8 100644
--- a/include/ck_tile/ops/elementwise.hpp
+++ b/include/ck_tile/ops/elementwise.hpp
@@ -4,4 +4,5 @@
 #pragma once
 
 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp
index a98f60b36..c24744bdb 100644
--- a/include/ck_tile/ops/epilogue.hpp
+++ b/include/ck_tile/ops/epilogue.hpp
@@ -5,4 +5,6 @@
 
 #include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
 #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
+#include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index 5dc49c3b0..7c5d5a6f3 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -9,23 +9,29 @@ namespace ck_tile {
 
 // this epilogue just store out a M*N matrix, row major
 
-template <typename AccDataType_, typename ODataType_, bool kPadM_, bool kPadN_>
+template <typename AccDataType_,
+          typename ODataType_,
+          bool kPadM_,
+          bool kPadN_,
+          bool UseRawStore_ = true>
 struct Default2DEpilogueProblem
 {
-    using AccDataType           = remove_cvref_t<AccDataType_>;
-    using ODataType             = remove_cvref_t<ODataType_>;
-    static constexpr bool kPadM = kPadM_;
-    static constexpr bool kPadN = kPadN_;
+    using AccDataType                 = remove_cvref_t<AccDataType_>;
+    using ODataType                   = remove_cvref_t<ODataType_>;
+    static constexpr bool kPadM       = kPadM_;
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool UseRawStore = UseRawStore_;
 };
 
 template <typename Problem_, typename Policy_ = void>
 struct Default2DEpilogue
 {
-    using Problem               = remove_cvref_t<Problem_>;
-    using AccDataType           = remove_cvref_t<typename Problem::AccDataType>;
-    using ODataType             = remove_cvref_t<typename Problem::ODataType>;
-    static constexpr bool kPadM = Problem::kPadM;
-    static constexpr bool kPadN = Problem::kPadN;
+    using Problem                     = remove_cvref_t<Problem_>;
+    using AccDataType                 = remove_cvref_t<typename Problem::AccDataType>;
+    using ODataType                   = remove_cvref_t<typename Problem::ODataType>;
+    static constexpr bool kPadM       = Problem::kPadM;
+    static constexpr bool kPadN       = Problem::kPadN;
+    static constexpr bool UseRawStore = Problem::UseRawStore;
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }
 
@@ -36,7 +42,7 @@ struct Default2DEpilogue
     {
 
         // TODO: this is ugly
-        if constexpr(kPadM || kPadN)
+        if constexpr(UseRawStore && (kPadM || kPadN))
         {
             store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
             buffer_store_fence();
diff --git a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
new file mode 100644
index 000000000..2e2960411
--- /dev/null
+++ b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+
+namespace ck_tile {
+
+template <bool kPadM_, bool kPadN_, bool UseRawStore_ = true, bool UseMax3_ = false>
+struct DynamicQuantEpilogueTraits
+{
+    static constexpr bool kPadM       = kPadM_;
+    static constexpr bool kPadN       = kPadN_;
+    static constexpr bool UseRawStore = UseRawStore_;
+    static constexpr bool UseMax3     = UseMax3_;
+};
+
+// this epilogue just store out a M*N matrix, row major
+template <typename AccDataType_,
+          typename YScaleDataType_,
+          typename ODataType_,
+          typename BlockShape_,
+          typename Traits_>
+struct DynamicQuantEpilogueProblem
+{
+    using AccDataType    = remove_cvref_t<AccDataType_>;
+    using YScaleDataType = remove_cvref_t<YScaleDataType_>;
+    using ODataType      = remove_cvref_t<ODataType_>;
+    using BlockShape     = remove_cvref_t<BlockShape_>; // can consum generic 2d shape
+    using Traits         = remove_cvref_t<Traits_>;
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct DynamicQuantEpilogue
+{
+    using Problem                     = remove_cvref_t<Problem_>;
+    using AccDataType                 = remove_cvref_t<typename Problem::AccDataType>;
+    using YScaleDataType              = remove_cvref_t<typename Problem::YScaleDataType>;
+    using ODataType                   = remove_cvref_t<typename Problem::ODataType>;
+    using BlockShape                  = remove_cvref_t<typename Problem::BlockShape>;
+    static constexpr bool kPadM       = Problem::Traits::kPadM;
+    static constexpr bool kPadN       = Problem::Traits::kPadN;
+    static constexpr bool UseRawStore = Problem::Traits::UseRawStore;
+    static constexpr bool UseMax3     = Problem::Traits::UseMax3;
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<AccDataType, AccDataType, BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
+        return reduce_crosswarp_sync.GetSmemSize();
+    }
+
+    // TODO: this function assume store out vector size is the same as OAccTile last dimension size
+    //       how do we fix this ?
+    template <typename ODramWindowTmp, typename YScaleWindow, typename OAccTile>
+    CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
+                                   YScaleWindow& y_scale_window,
+                                   const OAccTile& o_acc_tile,
+                                   void* smem)
+    {
+        auto reduce                = GetBlockReduce2d();
+        auto reduce_sync           = GetBlockReduce2dSync();
+        auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
+
+        const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); };
+
+        auto row_absmax = [&]() {
+            constexpr auto y_size_per_row =
+                OAccTile{}.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(
+                    number<1>{});
+            // constexpr auto y_size_per_row = OAccTile::get_lengths()[number<1>{}];
+            if constexpr(UseMax3 && std::is_same_v<AccDataType, float> && y_size_per_row % 2 == 0)
+            {
+                // fast max3 implementation
+                const auto f_max3 = [](auto acc_, auto v_0_, auto v_1_) {
+                    float rtn;
+                    asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                                 : "=v"(rtn)
+                                 : "v"(acc_), "v"(v_0_), "v"(v_1_));
+                    return rtn;
+                };
+                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_max3, sequence<1, 2>{});
+            }
+            else
+            {
+                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_absmax);
+            }
+        }();
+        reduce_sync(row_absmax, f_absmax);
+        reduce_crosswarp_sync(row_absmax, smem, f_absmax);
+
+        // here y_scale is Acc TYpe, need convert to YScale type later
+        auto y_scale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<AccDataType>(numeric<ODataType>::max());
+            },
+            row_absmax);
+
+        store_tile(y_scale_window, cast_tile<YScaleDataType>(y_scale));
+
+        auto o_acc_scaled_tile =
+            make_static_distributed_tensor<AccDataType>(o_acc_tile.get_tile_distribution());
+
+        sweep_tile(o_acc_tile, [&](auto idx) {
+            constexpr auto row_id  = make_tuple(idx[number<0>{}]);
+            o_acc_scaled_tile(idx) = o_acc_tile[idx] / y_scale(row_id);
+        });
+
+        // TODO: this is ugly
+        if constexpr(UseRawStore && (kPadM || kPadN))
+        {
+            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+            buffer_store_fence();
+        }
+        else
+        {
+            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index 9389a5397..e106264ce 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -43,4 +43,5 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index c3e028528..ac74782a3 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -39,4 +39,5 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/image_to_column.hpp b/include/ck_tile/ops/image_to_column.hpp
index 57e83a7a5..2b02bcc5d 100644
--- a/include/ck_tile/ops/image_to_column.hpp
+++ b/include/ck_tile/ops/image_to_column.hpp
@@ -6,4 +6,5 @@
 #include "ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp"
 #include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp"
 #include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp
index 2a403b0f4..711c5d859 100644
--- a/include/ck_tile/ops/layernorm2d.hpp
+++ b/include/ck_tile/ops/layernorm2d.hpp
@@ -4,9 +4,10 @@
 #pragma once
 
 #include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp"
-#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_shape.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index cebe5131a..9a2e06d05 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -5,19 +5,24 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp"
 
 namespace ck_tile {
 
 // host side args
 struct Layernorm2dFwdHostArgs
 {
-    const void* p_x;
-    const void* p_gamma;
-    const void* p_beta;
-
-    void* p_y;
-    void* p_mean;
-    void* p_invStd;
+    const void* p_x;          // [m ,n], input, fp16/bf16
+    const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used
+    const void* p_x_scale;    // [1 ,n], smooth scale input, fp32, nullptr if not used
+    const void* p_gamma;      // [1, n], gamma, prec same as input
+    const void* p_beta;       // [1, n], beta, prec same as input
+
+    void* p_y;          // [m, n], output, fp16/bf16
+    void* p_y_residual; // [m, n], shortcut output, prec same as input, nullptr if not used
+    void* p_y_scale;    // [m, 1], output a dynamic quant per row, nullptr if not used
+    void* p_mean;       // [m, 1], output mean, prec same as input, nullptr if not used
+    void* p_invStd;     // [m, 1], output inv-stdvariance, prec same as input, nullptr if not used
 
     float epsilon;
 
@@ -27,10 +32,11 @@ struct Layernorm2dFwdHostArgs
 };
 
 // TODO: Extract some type to wrapper class
-template <typename Pipeline_>
+template <typename Pipeline_, typename Epilogue_>
 struct Layernorm2dFwd
 {
     using Pipeline = remove_cvref_t<Pipeline_>;
+    using Epilogue = remove_cvref_t<Epilogue_>;
     using Problem  = typename Pipeline::Problem;
 
     using XDataType       = remove_cvref_t<typename Problem::XDataType>;
@@ -40,18 +46,26 @@ struct Layernorm2dFwd
     using YDataType       = remove_cvref_t<typename Problem::YDataType>;
     using MeanDataType    = remove_cvref_t<typename Problem::MeanDataType>;
     using InvStdDataType  = remove_cvref_t<typename Problem::InvStdDataType>;
+    using XScaleDataType  = remove_cvref_t<typename Problem::XScaleDataType>;
+    using YScaleDataType  = remove_cvref_t<typename Problem::YScaleDataType>;
+
+    // for simplicity, shortcut input/output type is same as X
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
 
     static constexpr bool kHasGamma       = !std::is_same_v<GammaDataType, null_type>;
     static constexpr bool kHasBeta        = !std::is_same_v<BetaDataType, null_type>;
-    static constexpr bool kSaveMeanInvStd = Problem::kSaveMeanInvStd;
-    static constexpr bool kSaveMean       = Problem::kSaveMeanInvStd;
-    static constexpr bool kSaveInvStd     = Problem::kSaveMeanInvStd;
-
-    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
-    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
-    static constexpr bool kPadM      = false; // always no need to pad along M
-    static constexpr bool kPadN      = Problem::kPadN;
-    static constexpr bool kTwoPass   = Problem::kTwoPass;
+    static constexpr bool kSaveMeanInvStd = Problem::Traits::kSaveMeanInvStd;
+    static constexpr bool kSaveMean       = Problem::Traits::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd     = Problem::Traits::kSaveMeanInvStd;
+
+    static constexpr index_t Block_M  = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N  = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM       = false; // always no need to pad along M
+    static constexpr bool kPadN       = Problem::Traits::kPadN;
+    static constexpr bool kTwoPass    = Problem::Traits::kTwoPass;
+    static constexpr auto kFusedAdd   = Problem::Traits::kFusedAdd;
+    static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
 
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
@@ -62,13 +76,18 @@ struct Layernorm2dFwd
 
     struct Kargs
     {
-        const void* p_x;
-        const void* p_gamma;
-        const void* p_beta;
+        const void* p_x;          // [m ,n], input, fp16/bf16
+        const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used
+        const void* p_x_scale;    // [1 ,n], smooth scale input, fp32, nullptr if not used
+        const void* p_gamma;      // [1, n], gamma, prec same as input
+        const void* p_beta;       // [1, n], beta, prec same as input
 
-        void* p_y;
-        void* p_mean;
-        void* p_invStd;
+        void* p_y;          // [m, n], output, fp16/bf16
+        void* p_y_residual; // [m, n], shortcut output, prec same as input, nullptr if not used
+        void* p_y_scale;    // [m, 1], output a dynamic quant per row, nullptr if not used
+
+        void* p_mean;   // [m, 1], output mean, prec same as input, nullptr if not used
+        void* p_invStd; // [m, 1], output inv-stdvariance, prec same as input, nullptr if not used
 
         float epsilon;
 
@@ -81,9 +100,13 @@ struct Layernorm2dFwd
     CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
     {
         return Kargs{hargs.p_x,
+                     hargs.p_x_residual,
+                     hargs.p_x_scale,
                      hargs.p_gamma,
                      hargs.p_beta,
                      hargs.p_y,
+                     hargs.p_y_residual,
+                     hargs.p_y_scale,
                      hargs.p_mean,
                      hargs.p_invStd,
                      hargs.epsilon,
@@ -106,6 +129,7 @@ struct Layernorm2dFwd
     template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
     template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    template <> struct t2s<ck_tile::int8_t> { static constexpr const char * name = "int8"; };
     // clang-format on
 
     // in byte
@@ -113,24 +137,41 @@ struct Layernorm2dFwd
 
     CK_TILE_HOST static std::string GetName()
     {
+#define _SS_ std::string
+#define _TS_ std::to_string
         // clang-format off
         using S_ = typename Problem::BlockShape;
         auto surfix = [&] () {
             std::string n;
+            if (kFusedAdd != Layernorm2dFusedAddEnum::NO_ADD) n += _SS_("_") + Layernorm2dFusedAddEnumName<kFusedAdd>::name;
+            if (kFusedQuant != Layernorm2dFusedQuantEnum::NO_SWEEP) n += _SS_("_") + Layernorm2dFusedQuantEnumName<kFusedQuant>::name;
             if (kPadN) n += "_pn";
             if (kSaveMeanInvStd) n += "_mv";
-            if (kTwoPass) n += "_2p";
+            // if (kTwoPass) n += "_2p";
             return n; }();
 
-        #define _SS_  std::string
-        #define _TS_  std::to_string
-        return _SS_("layernorm2d_fwd_") + _SS_(t2s<XDataType>::name) + "_" + 
+        auto prec_str = [&] () {
+            std::string base_str = _SS_(t2s<XDataType>::name);
+            if (!std::is_same_v<XDataType, YDataType>) {
+                base_str += _SS_("_") + _SS_(t2s<YDataType>::name);
+            }
+            if (kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT) {
+                base_str += _SS_("_sx") + _SS_(t2s<XScaleDataType>::name);
+                base_str += _SS_("_sy") + _SS_(t2s<YScaleDataType>::name);
+            }
+            if (kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT) {
+                base_str += _SS_("_sy") + _SS_(t2s<YScaleDataType>::name);
+            }
+            return base_str;
+        }();
+
+        return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" + 
              _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
              _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
              _SS_(Pipeline::name) + surfix;
-        #undef _SS_
-        #undef _TS_
         // clang-format on
+#undef _SS_
+#undef _TS_
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -153,6 +194,31 @@ struct Layernorm2dFwd
                 tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
         }();
 
+        const auto x_residual_window = [&]() {
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
+                         kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+            {
+                const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                    static_cast<const XResidualDataType*>(kargs.p_x_residual),
+                    make_tuple(kargs.m, kargs.n),
+                    make_tuple(kargs.stride, 1),
+                    number<Vector_N>{},
+                    number<1>{});
+
+                // NOTE: we don't do any pad in this kernel for loading, assume that inside kernel
+                // will check the max count dynamically
+                const auto tmp2_ = pad_tensor_view(tmp_,
+                                                   make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                                   sequence<false, false>{});
+                return make_tile_window(
+                    tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+            }
+            else
+            {
+                return make_null_tile_window(make_tuple(number<Block_M>{}, number<Block_N>{}));
+            }
+        }();
+
         const auto gamma_window = [&]() {
             const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const GammaDataType*>(kargs.p_gamma),
@@ -194,6 +260,28 @@ struct Layernorm2dFwd
                 tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
         }();
 
+        auto y_residual_window = [&]() {
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
+            {
+                auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                    static_cast<YResidualDataType*>(kargs.p_y_residual),
+                    make_tuple(kargs.m, kargs.n),
+                    make_tuple(kargs.stride, 1),
+                    number<Vector_N>{},
+                    number<1>{});
+
+                auto tmp2_ = pad_tensor_view(tmp_,
+                                             make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                             sequence<kPadM, kPadN>{});
+                return make_tile_window(
+                    tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+            }
+            else
+            {
+                return make_null_tile_window(make_tuple(number<Block_M>{}, number<Block_N>{}));
+            }
+        }();
+
         auto mean_window = [&]() {
             if constexpr(kSaveMean)
             {
@@ -232,17 +320,60 @@ struct Layernorm2dFwd
                 return make_null_tile_window(make_tuple(number<Block_M>{}));
         }();
 
+        auto x_scale_window = [&]() {
+            if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
+            {
+                const auto win_ = [&]() {
+                    const auto tmp_0_ = make_naive_tensor_view_packed<address_space_enum::global>(
+                        static_cast<const XScaleDataType*>(kargs.p_x_scale),
+                        make_tuple(kargs.n),
+                        number<Vector_N>{});
+
+                    return pad_tensor_view(tmp_0_,
+                                           make_tuple(number<Block_N>{}),
+                                           sequence<false>{}); // x_scale no need pad
+                }();
+                return make_tile_window(win_, make_tuple(number<Block_N>{}), {0});
+            }
+            else
+                return make_null_tile_window(make_tuple(number<Block_N>{}));
+        }();
+
+        auto y_scale_window = [&]() {
+            if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT ||
+                         kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT)
+            {
+                const auto win_ = [&]() {
+                    const auto tmp_0_ = make_naive_tensor_view_packed<address_space_enum::global>(
+                        static_cast<YScaleDataType*>(kargs.p_y_scale),
+                        make_tuple(kargs.m),
+                        number<1>{});
+
+                    return pad_tensor_view(
+                        tmp_0_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+                }();
+                return make_tile_window(win_, make_tuple(number<Block_M>{}), {iM});
+            }
+            else
+                return make_null_tile_window(make_tuple(number<Block_M>{}));
+        }();
+
         __shared__ char smem[GetSmemSize()];
 
         Pipeline{}(x_window,
+                   x_residual_window,
                    gamma_window,
                    beta_window,
                    y_window,
+                   y_residual_window,
                    mean_window,
                    inv_std_window,
+                   x_scale_window,
+                   y_scale_window,
                    static_cast<const ComputeDataType>(kargs.epsilon),
                    kargs.n,
-                   smem);
+                   smem,
+                   Epilogue{});
     }
 };
 
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index c767a472a..16a7c3b86 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -5,6 +5,7 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp"
+#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp"
 #include <string>
 #include <type_traits>
 
@@ -24,20 +25,25 @@ struct Layernorm2dFwdPipelineOnePass
     using MeanDataType    = ck_tile::remove_cvref_t<typename Problem::MeanDataType>;
     using InvStdDataType  = ck_tile::remove_cvref_t<typename Problem::InvStdDataType>;
 
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
+
     static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
     static constexpr bool kHasBeta    = !std::is_same_v<BetaDataType, ck_tile::null_type>;
-    static constexpr bool kSaveMean   = Problem::kSaveMeanInvStd;
-    static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveMean   = Problem::Traits::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd = Problem::Traits::kSaveMeanInvStd;
 
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
-    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool kPadN              = Problem::Traits::kPadN;
+    static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
+    static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr_op"; // block per row
+            return "bpr"; // block per row
         else
-            return "wpr_op"; // warp per row
+            return "wpr"; // warp per row
     }();
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
@@ -46,20 +52,30 @@ struct Layernorm2dFwdPipelineOnePass
     }
 
     template <typename XWindow,
+              typename XResidualWindow,
               typename GammaWindow,
               typename BetaWindow,
               typename YWindow,
+              typename YResidualWindow,
               typename MeanWindow,
-              typename InvStdWindow>
+              typename InvStdWindow,
+              typename XScaleWindow,
+              typename YScaleWindow,
+              typename Epilogue>
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const XResidualWindow& x_residual_window_,
                                    const GammaWindow& gamma_window_,
                                    const BetaWindow& beta_window_,
-                                   YWindow& y_window,
+                                   YWindow& y_window_,
+                                   const YResidualWindow& y_residual_window_,
                                    MeanWindow& mean_window,
                                    InvStdWindow& inv_std_window,
+                                   const XScaleWindow& x_scale_window_,
+                                   YScaleWindow& y_scale_window,
                                    ComputeDataType epsilon,
                                    ck_tile::index_t row_size,
-                                   void* smem) const
+                                   void* smem,
+                                   Epilogue) const
     {
         const auto x_window =
             make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
@@ -67,8 +83,17 @@ struct Layernorm2dFwdPipelineOnePass
             gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         const auto beta_window = make_tile_window(
             beta_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+        const auto x_residual_window = make_tile_window(
+            x_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_residual_window = make_tile_window(
+            y_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        const auto x_scale_window = make_tile_window(
+            x_scale_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+
+        auto x       = load_tile(x_window);
+        auto x_resi  = load_tile(x_residual_window);
+        auto x_scale = load_tile(x_scale_window);
 
-        const auto x  = load_tile(x_window);
         int cur_count = 0;
         int max_count =
             block_tile_welford_calculate_max_count<typename Problem::BlockShape>(row_size);
@@ -81,6 +106,18 @@ struct Layernorm2dFwdPipelineOnePass
         const auto gamma = load_tile(gamma_window);
         const auto beta  = load_tile(beta_window);
 
+        if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
+                     kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+        {
+            sweep_tile(x_resi, [&](auto idx) {
+                // compute x = x_resi + x
+                x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
+                         type_convert<YResidualDataType>(x(idx));
+            });
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
+                store_tile(y_residual_window, x);
+        }
+
         // compute welford each-thread->cross-lane->cross-warp
         auto [mean, var] = block_welford(x, cur_count, max_count);
         block_welford_sync(mean, var, cur_count);
@@ -100,8 +137,8 @@ struct Layernorm2dFwdPipelineOnePass
             store_tile(inv_std_window, cast_tile<InvStdDataType>(inv_std));
 
         // layernorm computation
-        auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
-        sweep_tile(y, [&, mean_ = mean](auto idx) {
+        auto ln = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+        sweep_tile(ln, [&, mean_ = mean](auto idx) {
             constexpr auto i_idx = make_tuple(idx[number<0>{}]);
             constexpr auto j_idx = make_tuple(idx[number<1>{}]);
 
@@ -109,11 +146,28 @@ struct Layernorm2dFwdPipelineOnePass
             const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
 
             const auto x_ = type_convert<ComputeDataType>(x[idx]);
-            auto y_       = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+            auto ln_      = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
 
-            y(idx) = type_convert<YDataType>(y_);
+            ln(idx) = ln_;
         });
-        store_tile(y_window, y);
+
+        if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
+        {
+            // smooth-quant pre-scale, then run rowwise-quant
+            sweep_tile(ln, [&](auto idx) {
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                const auto xs_       = type_convert<ComputeDataType>(x_scale[j_idx]);
+                ln(idx)              = ln(idx) * xs_;
+            });
+        }
+
+        if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT ||
+                     kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
+        {
+            Epilogue{}(y_window_, y_scale_window, ln, smem);
+        }
+        else
+            Epilogue{}(y_window_, ln);
     }
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
index 8e9f8e81e..7ec830add 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
@@ -14,10 +14,10 @@ template <typename XDataType_,
           typename YDataType_,
           typename MeanDataType_,
           typename InvStdDataType_,
+          typename XScaleDataType_,
+          typename YScaleDataType_,
           typename BlockShape_,
-          bool kPadN_,
-          bool kSaveMeanInvStd_,
-          bool kTwoPass_>
+          typename Traits_>
 struct Layernorm2dFwdPipelineProblem
 {
     using XDataType       = remove_cvref_t<XDataType_>;
@@ -27,14 +27,14 @@ struct Layernorm2dFwdPipelineProblem
     using YDataType       = remove_cvref_t<YDataType_>;
     using MeanDataType    = remove_cvref_t<MeanDataType_>;
     using InvStdDataType  = remove_cvref_t<InvStdDataType_>;
+    using XScaleDataType  = remove_cvref_t<XScaleDataType_>;
+    using YScaleDataType  = remove_cvref_t<YScaleDataType_>;
     using BlockShape      = remove_cvref_t<BlockShape_>;
 
     static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
     static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
 
-    static constexpr bool kPadN           = kPadN_;
-    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
-    static constexpr bool kTwoPass        = kTwoPass_;
+    using Traits = remove_cvref_t<Traits_>;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index e35d02e70..ec10efbc6 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -24,20 +24,25 @@ struct Layernorm2dFwdPipelineTwoPass
     using MeanDataType    = ck_tile::remove_cvref_t<typename Problem::MeanDataType>;
     using InvStdDataType  = ck_tile::remove_cvref_t<typename Problem::InvStdDataType>;
 
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
+
     static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
     static constexpr bool kHasBeta    = !std::is_same_v<BetaDataType, ck_tile::null_type>;
-    static constexpr bool kSaveMean   = Problem::kSaveMeanInvStd;
-    static constexpr bool kSaveInvStd = Problem::kSaveMeanInvStd;
+    static constexpr bool kSaveMean   = Problem::Traits::kSaveMeanInvStd;
+    static constexpr bool kSaveInvStd = Problem::Traits::kSaveMeanInvStd;
 
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
-    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool kPadN              = Problem::Traits::kPadN;
+    static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
+    static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr_tp"; // block per row
+            return "bpr_2p"; // block per row
         else
-            return "wpr_tp"; // warp per row
+            return "wpr_2p"; // warp per row
     }();
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
@@ -46,20 +51,30 @@ struct Layernorm2dFwdPipelineTwoPass
     }
 
     template <typename XWindow,
+              typename XResidualWindow,
               typename GammaWindow,
               typename BetaWindow,
               typename YWindow,
+              typename YResidualWindow,
               typename MeanWindow,
-              typename InvStdWindow>
+              typename InvStdWindow,
+              typename XScaleWindow,
+              typename YScaleWindow,
+              typename Epilogue>
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const XResidualWindow& x_residual_window_,
                                    const GammaWindow& gamma_window_,
                                    const BetaWindow& beta_window_,
                                    YWindow& y_window,
+                                   const YResidualWindow& y_residual_window_,
                                    MeanWindow& mean_window,
                                    InvStdWindow& inv_std_window,
+                                   const XScaleWindow& /*x_scale_window*/,
+                                   YScaleWindow& /*y_scale_window*/,
                                    ComputeDataType epsilon,
                                    ck_tile::index_t row_size,
-                                   void* smem) const
+                                   void* smem,
+                                   Epilogue) const
     {
         auto x_window =
             make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
@@ -67,6 +82,10 @@ struct Layernorm2dFwdPipelineTwoPass
             gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         auto beta_window = make_tile_window(
             beta_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
+        auto x_residual_window = make_tile_window(
+            x_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_residual_window = make_tile_window(
+            y_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
 
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
@@ -93,9 +112,26 @@ struct Layernorm2dFwdPipelineTwoPass
 
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            const auto x = load_tile(x_window);
-            block_welford(x, mean, var, cur_count, max_count);
+            auto x      = load_tile(x_window);
+            auto x_resi = load_tile(x_residual_window);
+
             move_tile_window(x_window, {0, Block_N});
+            move_tile_window(x_residual_window, {0, Block_N});
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
+                         kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+            {
+                sweep_tile(x_resi, [&](auto idx) {
+                    // compute x = x_resi + x
+                    x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
+                             type_convert<YResidualDataType>(x(idx));
+                });
+                if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
+                {
+                    store_tile(y_residual_window, x);
+                    move_tile_window(y_residual_window, {0, Block_N});
+                }
+            }
+            block_welford(x, mean, var, cur_count, max_count);
         }
 
         block_welford_sync(mean, var, cur_count);
@@ -119,6 +155,7 @@ struct Layernorm2dFwdPipelineTwoPass
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
 
         move_tile_window(x_window, {0, -Block_N});
+        move_tile_window(x_residual_window, {0, -Block_N});
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(beta_window, {stride_to_right_most_window});
         move_tile_window(y_window, {0, stride_to_right_most_window});
@@ -126,14 +163,24 @@ struct Layernorm2dFwdPipelineTwoPass
         // layernorm computation
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            const auto x = load_tile(x_window);
+            auto x      = load_tile(x_window);
+            auto x_resi = load_tile(x_residual_window);
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
+                         kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+            {
+                sweep_tile(x_resi, [&](auto idx) {
+                    // compute x = x_resi + x
+                    x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
+                             type_convert<YResidualDataType>(x(idx));
+                });
+            }
             // load gamma/beta (TODO: support no gamma/beta?)
             const auto gamma = load_tile(gamma_window);
             const auto beta  = load_tile(beta_window);
 
-            auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
+            auto ln = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
 
-            sweep_tile(y, [&, mean_ = mean](auto idx) {
+            sweep_tile(ln, [&, mean_ = mean](auto idx) {
                 constexpr auto i_idx = make_tuple(idx[number<0>{}]);
                 constexpr auto j_idx = make_tuple(idx[number<1>{}]);
 
@@ -141,14 +188,16 @@ struct Layernorm2dFwdPipelineTwoPass
                 const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
 
                 const auto x_ = type_convert<ComputeDataType>(x[idx]);
-                auto y_       = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+                auto ln_      = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
 
-                y(idx) = type_convert<YDataType>(y_);
+                ln(idx) = ln_;
             });
 
-            store_tile(y_window, y);
+            static_assert(kFusedQuant != Layernorm2dFusedQuantEnum::DYNAMIC_QUANT);
+            Epilogue{}(y_window, ln);
 
             move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(x_residual_window, {0, -Block_N});
             move_tile_window(gamma_window, {-Block_N});
             move_tile_window(beta_window, {-Block_N});
             move_tile_window(y_window, {0, -Block_N});
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
new file mode 100644
index 000000000..fb327f74a
--- /dev/null
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+enum class Layernorm2dFusedAddEnum
+{
+    NO_ADD = 0,
+    // fused add before layernorm and store result to global
+    PRE_ADD_STORE = 1,
+    // fused add before layernorm, but not store result
+    PRE_ADD = 2,
+};
+
+// clang-format off
+template<Layernorm2dFusedAddEnum> struct Layernorm2dFusedAddEnumName;
+template<> struct Layernorm2dFusedAddEnumName<Layernorm2dFusedAddEnum::NO_ADD> { static constexpr const char * name = "no"; };
+template<> struct Layernorm2dFusedAddEnumName<Layernorm2dFusedAddEnum::PRE_ADD_STORE> { static constexpr const char * name = "pras"; };
+template<> struct Layernorm2dFusedAddEnumName<Layernorm2dFusedAddEnum::PRE_ADD> { static constexpr const char * name = "pra"; };
+// clang-format on
+
+enum class Layernorm2dFusedQuantEnum
+{
+    NO_SWEEP             = 0,
+    SMOOTH_DYNAMIC_QUANT = 1, // smooth oulier + rowwise quant, need input x-scale and store y_scale
+    DYNAMIC_QUANT        = 2, // rowwise quant, store out a y-scale
+};
+
+// clang-format off
+template<Layernorm2dFusedQuantEnum> struct Layernorm2dFusedQuantEnumName;
+template<> struct Layernorm2dFusedQuantEnumName<Layernorm2dFusedQuantEnum::NO_SWEEP> { static constexpr const char * name = "no"; };
+template<> struct Layernorm2dFusedQuantEnumName<Layernorm2dFusedQuantEnum::DYNAMIC_QUANT> { static constexpr const char * name = "dqt"; };
+template<> struct Layernorm2dFusedQuantEnumName<Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT> { static constexpr const char * name = "smdqt"; };
+// clang-format on
+
+template <bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kTwoPass_,
+          Layernorm2dFusedAddEnum kFusedAdd_,
+          Layernorm2dFusedQuantEnum kFusedQuant_>
+struct Layernorm2dFwdTraits
+{
+    static constexpr bool kPadN                            = kPadN_;
+    static constexpr bool kSaveMeanInvStd                  = kSaveMeanInvStd_;
+    static constexpr bool kTwoPass                         = kTwoPass_;
+    static constexpr Layernorm2dFusedAddEnum kFusedAdd     = kFusedAdd_;
+    static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp
index ee8c69372..990e9ecc0 100644
--- a/include/ck_tile/ops/permute.hpp
+++ b/include/ck_tile/ops/permute.hpp
@@ -5,4 +5,5 @@
 
 #include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp"
 #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index fe2d24044..aa617ee2b 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -7,4 +7,5 @@
 #include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index fa3007d1e..c93329bfb 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -301,7 +301,10 @@ struct BlockReduce2D
                     .get_static_tile_distribution_encoding(),
                 ReduceDim{}));
 
-        return make_static_distributed_tensor<InDataType>(acc_dstr);
+        auto dst_ = make_static_distributed_tensor<InDataType>(acc_dstr);
+        // init acc_tensor
+        tile_elementwise_inout([&](auto& x_) { x_ = type_convert<InDataType>(reduce_init); }, dst_);
+        return dst_;
     }
 
     // return number of pixels each lane need to reduce
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index beb8c718e..3c6814711 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -17,14 +17,24 @@ struct BlockReduce2d
 
     CK_TILE_DEVICE constexpr BlockReduce2d() {}
 
-    template <typename XDistributedTensor_, typename YDistributedTensor_, typename ReduceFunc>
+    template <typename XDistributedTensor_,
+              typename YDistributedTensor_,
+              typename ReduceFunc,
+              typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
     CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
                                    YDistributedTensor_& y_tensor,
-                                   const ReduceFunc& reduce_func)
+                                   const ReduceFunc& reduce_func,
+                                   ReducePacksPerXDim = {})
     {
+        sweep_tile<XDistributedTensor_>(
+            [&](auto... idx_) {
+                constexpr auto idx_0 = make_tuple(make_tuple(idx_[number<0>{}]...)[number<0>{}]);
+                y_tensor(idx_0)      = reduce_func(y_tensor(idx_0), x_tensor[idx_]...);
+            },
+            ReducePacksPerXDim{});
+#if 0
         constexpr auto I0 = number<0>{};
         constexpr auto I1 = number<1>{};
-
         constexpr auto spans = XDistributedTensor_::get_distributed_spans();
 
         // FIXME: hard coded to reduce 2nd axis
@@ -42,6 +52,7 @@ struct BlockReduce2d
 
             y_tensor(y_dstr_idx) = y;
         });
+#endif
     }
 
     template <typename XDistributedTensor_>
@@ -63,14 +74,17 @@ struct BlockReduce2d
         return tensor;
     }
 
-    template <typename XDistributedTensor_, typename ReduceFunc>
+    template <typename XDistributedTensor_,
+              typename ReduceFunc,
+              typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
     CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor,
                                    const ComputeDataType& reduce_init,
-                                   const ReduceFunc& reduce_func)
+                                   const ReduceFunc& reduce_func,
+                                   ReducePacksPerXDim = {})
     {
         auto y_tensor = MakeYBlockTile<XDistributedTensor_>();
         set_tile(y_tensor, reduce_init);
-        (*this)(x_tensor, y_tensor, reduce_func);
+        (*this)(x_tensor, y_tensor, reduce_func, ReducePacksPerXDim{});
 
         return y_tensor;
     }
diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp
index 98c60f1b5..f0a6cf960 100644
--- a/include/ck_tile/ops/rmsnorm2d.hpp
+++ b/include/ck_tile/ops/rmsnorm2d.hpp
@@ -9,4 +9,5 @@
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp
index 584ca7068..4df34e1e0 100644
--- a/include/ck_tile/ops/softmax.hpp
+++ b/include/ck_tile/ops/softmax.hpp
@@ -5,4 +5,5 @@
 
 #include "ck_tile/ops/softmax/block/block_softmax_2d.hpp"
 #include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp
index b1143e4a0..fcae3e02d 100644
--- a/include/ck_tile/ops/topk.hpp
+++ b/include/ck_tile/ops/topk.hpp
@@ -5,4 +5,5 @@
 
 #include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp"
 #include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp
index 809473d53..cc7dbffee 100644
--- a/include/ck_tile/ops/topk_softmax.hpp
+++ b/include/ck_tile/ops/topk_softmax.hpp
@@ -7,4 +7,5 @@
 #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp"
 #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
 #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/welford.hpp
index ebf940683..a4c479dd9 100644
--- a/include/ck_tile/ops/welford.hpp
+++ b/include/ck_tile/ops/welford.hpp
@@ -6,4 +6,5 @@
 #include "ck_tile/ops/welford/block/block_welford.hpp"
 #include "ck_tile/ops/welford/block/block_welford_problem.hpp"
 #include "ck_tile/ops/welford/thread/thread_welford.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
-- 
GitLab


From 550248deecf974959df8175010221de88b79246f Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Fri, 1 Nov 2024 11:52:50 +0800
Subject: [PATCH 030/153] [layernorm] hot fix (#1620)

* hot fix ln

* some rename
---
 .../02_layernorm2d/layernorm2d_fwd.cpp        | 31 ++++++++++++-------
 .../layernorm2d_fwd_pipeline_one_pass.hpp     |  5 +--
 .../layernorm2d_fwd_pipeline_two_pass.hpp     | 10 +++---
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index 43f4e8c72..8f029c212 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -127,9 +127,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<XScaleDataType> x_scale_host_dev({n});
 
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
+    ck_tile::FillUniformDistribution<XScaleDataType>{-1.f, 1.f}(x_scale_host);
     ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
     ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
-    ck_tile::FillUniformDistribution<XScaleDataType>{-1.f, 1.f}(x_scale_host);
 
     ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
@@ -212,7 +213,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
                            x_host.mData.cend(),
                            x_residual_host.mData.cbegin(),
                            x_host.mData.begin(),
-                           std::plus<XDataType>{});
+                           [](auto x_, auto r_) {
+                               auto o_ = ck_tile::type_convert<ComputeDataType>(x_) +
+                                         ck_tile::type_convert<ComputeDataType>(r_);
+                               return ck_tile::type_convert<XDataType>(o_);
+                           });
         }
         ck_tile::reference_layernorm2d_fwd<XDataType,
                                            GammaDataType,
@@ -280,10 +285,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
         y_buf.FromDevice(y_host_dev.data());
 
-        ck_tile::HostTensor<YResidualDataType> sy_host_dev({m, n}, {stride, 1});
+        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {stride, 1});
         if(fused_add == 1)
         {
-            y_residual_buf.FromDevice(sy_host_dev.data());
+            y_residual_buf.FromDevice(y_residual_host_dev.data());
         }
 
         auto [rtol, atol] = get_elimit<InDataType>();
@@ -294,8 +299,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
             if(fused_add == 1)
             {
-                pass &= ck_tile::check_err(
-                    sy_host_dev, x_host, std::string("ADD Error: Incorrect results!"), rtol, atol);
+                pass &= ck_tile::check_err(y_residual_host_dev,
+                                           x_host,
+                                           std::string("ADD Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
             }
         }
         else
@@ -314,12 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                            atol);
                 if(fused_add == 1)
                 {
-                    std::vector<YResidualDataType> sy_host_dev_row(
-                        sy_host_dev.begin() + i_r * stride, sy_host_dev.begin() + i_r * stride + n);
-                    std::vector<YResidualDataType> sy_host_ref_row(
+                    std::vector<YResidualDataType> y_residual_host_dev_row(
+                        y_residual_host_dev.begin() + i_r * stride,
+                        y_residual_host_dev.begin() + i_r * stride + n);
+                    std::vector<YResidualDataType> y_residual_host_ref_row(
                         x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n);
-                    pass &= ck_tile::check_err(sy_host_dev_row,
-                                               sy_host_ref_row,
+                    pass &= ck_tile::check_err(y_residual_host_dev_row,
+                                               y_residual_host_ref_row,
                                                std::string("ADD[") + std::to_string(i_r) +
                                                    std::string("] Error: Incorrect results!"),
                                                rtol,
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index 16a7c3b86..5601f3a68 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -111,8 +111,9 @@ struct Layernorm2dFwdPipelineOnePass
         {
             sweep_tile(x_resi, [&](auto idx) {
                 // compute x = x_resi + x
-                x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
-                         type_convert<YResidualDataType>(x(idx));
+                auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
+                           type_convert<ComputeDataType>(x(idx));
+                x(idx) = type_convert<XDataType>(re_);
             });
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
                 store_tile(y_residual_window, x);
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index ec10efbc6..48f66739d 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -122,8 +122,9 @@ struct Layernorm2dFwdPipelineTwoPass
             {
                 sweep_tile(x_resi, [&](auto idx) {
                     // compute x = x_resi + x
-                    x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
-                             type_convert<YResidualDataType>(x(idx));
+                    auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
+                               type_convert<ComputeDataType>(x(idx));
+                    x(idx) = type_convert<XDataType>(re_);
                 });
                 if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
                 {
@@ -170,8 +171,9 @@ struct Layernorm2dFwdPipelineTwoPass
             {
                 sweep_tile(x_resi, [&](auto idx) {
                     // compute x = x_resi + x
-                    x(idx) = type_convert<YResidualDataType>(x_resi(idx)) +
-                             type_convert<YResidualDataType>(x(idx));
+                    auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
+                               type_convert<ComputeDataType>(x(idx));
+                    x(idx) = type_convert<XDataType>(re_);
                 });
             }
             // load gamma/beta (TODO: support no gamma/beta?)
-- 
GitLab


From fbd654545a2644f99c3e7a493ebcc2169938583b Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Fri, 1 Nov 2024 13:51:56 +0800
Subject: [PATCH 031/153] [Ck_tile] smoothquant (#1617)

* fix compile error

* fix typo of padding

* Add smoothquant op

* Add smoothquant instance library

* refine type

* add test script

* Re-generate smoothquant.hpp

* Always use 'current year' in copyright

* use Generic2dBlockShape instead

* Add vector = 8 instance back

* Find exe path automatically

* Simplify the api condition

* Remove debugging code

* update year

* Add blank line between function declaration

* explicitly cast return value to dim3

* refine return value

* Fix default warmup and repeat value

* Add comment

* refactor sommthquant cmake

* Add README

* Fix typo

---------

Co-authored-by: Po Yen, Chen <PoYen.Chen@amd.com>
---
 .../02_layernorm2d/script/perf_test.sh        |   5 +-
 .../02_layernorm2d/script/smoke_test.sh       |   3 +-
 .../10_rmsnorm2d/example_rmsnorm2d_fwd.cpp    |   2 +-
 .../instances/rmsnorm2d_fwd_api.cpp           |   9 +-
 .../ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp    |   2 +-
 .../ck_tile/10_rmsnorm2d/script/perf_test.sh  |   5 +-
 .../ck_tile/10_rmsnorm2d/script/smoke_test.sh |   3 +-
 .../add_rmsnorm2d_rdquant_fwd.hpp             |   6 +-
 .../example_add_rmsnorm2d_rdquant_fwd.cpp     |   8 +-
 .../add_rmsnorm2d_rdquant_fwd_api.cpp         |   9 +-
 .../script/perf_test.sh                       |   5 +-
 .../script/smoke_test.sh                      |   3 +-
 example/ck_tile/12_smoothquant/CMakeLists.txt |  24 ++
 example/ck_tile/12_smoothquant/README.md      |  21 ++
 .../12_smoothquant/example_smoothquant.cpp    | 237 ++++++++++++++++++
 .../smoothquant_bf16_n1024_instance.cpp       |  22 ++
 .../smoothquant_bf16_n1536_instance.cpp       |  13 +
 .../smoothquant_bf16_n2048_instance.cpp       |  14 ++
 .../smoothquant_bf16_n256_instance.cpp        |  12 +
 .../smoothquant_bf16_n3072_instance.cpp       |  14 ++
 .../smoothquant_bf16_n4096_instance.cpp       |  14 ++
 .../smoothquant_bf16_n4096_tp_instance.cpp    |  14 ++
 .../smoothquant_bf16_n512_instance.cpp        |  13 +
 .../smoothquant_bf16_n64_n128_instance.cpp    |  12 +
 .../smoothquant_bf16_n768_instance.cpp        |  12 +
 .../smoothquant_fp16_n1024_instance.cpp       |  22 ++
 .../smoothquant_fp16_n1536_instance.cpp       |  13 +
 .../smoothquant_fp16_n2048_instance.cpp       |  14 ++
 .../smoothquant_fp16_n256_instance.cpp        |  12 +
 .../smoothquant_fp16_n3072_instance.cpp       |  14 ++
 .../smoothquant_fp16_n4096_instance.cpp       |  14 ++
 .../smoothquant_fp16_n4096_tp_instance.cpp    |  14 ++
 .../smoothquant_fp16_n512_instance.cpp        |  13 +
 .../smoothquant_fp16_n64_n128_instance.cpp    |  12 +
 .../smoothquant_fp16_n768_instance.cpp        |  12 +
 .../instances/smoothquant_fwd_api.cpp         | 143 +++++++++++
 .../instances/smoothquant_instance_common.hpp |  62 +++++
 .../12_smoothquant/script/perf_test.sh        |  37 +++
 .../12_smoothquant/script/smoke_test.sh       |  30 +++
 .../ck_tile/12_smoothquant/smoothquant.cpp    | 218 ++++++++++++++++
 .../ck_tile/12_smoothquant/smoothquant.hpp    | 114 +++++++++
 example/ck_tile/CMakeLists.txt                |   1 +
 include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp |   1 -
 .../add_rmsnorm2d_rdquant_fwd_kernel.hpp      |  17 +-
 .../add_rmsnorm2d_rdquant_fwd_shape.hpp       |  78 ------
 ...2d_rdquant_fwd_pipeline_default_policy.hpp |   1 +
 .../kernel/layernorm2d_fwd_kernel.hpp         |   4 +-
 ...ayernorm2d_fwd_pipeline_default_policy.hpp |   1 +
 .../layernorm2d_fwd_pipeline_problem.hpp      |   2 +-
 .../pipeline/layernorm2d_fwd_traits.hpp       |   2 +-
 .../ops/reduce/block/block_reduce2d.hpp       |   3 +-
 include/ck_tile/ops/rmsnorm2d.hpp             |   1 -
 .../rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp |  12 +-
 .../rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp  |  78 ------
 .../rmsnorm2d_fwd_pipeline_default_policy.hpp |   1 +
 include/ck_tile/ops/smoothquant.hpp           |  12 +
 .../smoothquant/kernel/smoothquant_kernel.hpp | 176 +++++++++++++
 .../smoothquant_pipeline_default_policy.hpp   |  95 +++++++
 .../smoothquant_pipeline_one_pass.hpp         |  94 +++++++
 .../pipeline/smoothquant_pipeline_problem.hpp |  35 +++
 .../smoothquant_pipeline_two_pass.hpp         | 132 ++++++++++
 include/ck_tile/remod.py                      |   5 +-
 62 files changed, 1758 insertions(+), 219 deletions(-)
 create mode 100644 example/ck_tile/12_smoothquant/CMakeLists.txt
 create mode 100644 example/ck_tile/12_smoothquant/README.md
 create mode 100644 example/ck_tile/12_smoothquant/example_smoothquant.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp
 create mode 100644 example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
 create mode 100755 example/ck_tile/12_smoothquant/script/perf_test.sh
 create mode 100755 example/ck_tile/12_smoothquant/script/smoke_test.sh
 create mode 100644 example/ck_tile/12_smoothquant/smoothquant.cpp
 create mode 100644 example/ck_tile/12_smoothquant/smoothquant.hpp
 delete mode 100644 include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
 delete mode 100644 include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
 create mode 100644 include/ck_tile/ops/smoothquant.hpp
 create mode 100644 include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
 create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp
 create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
 create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp
 create mode 100644 include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp

diff --git a/example/ck_tile/02_layernorm2d/script/perf_test.sh b/example/ck_tile/02_layernorm2d/script/perf_test.sh
index a34624536..5a34e1928 100755
--- a/example/ck_tile/02_layernorm2d/script/perf_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh
@@ -1,6 +1,5 @@
-
-# run from top of ck folder
-EXE=build/bin/tile_example_layernorm2d_fwd
+#!/bin/sh
+EXE="$(find . -name tile_example_layernorm2d_fwd -type f | head -n 1)"
 
 $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
index d56406b6f..b7fd354bb 100755
--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -1,6 +1,5 @@
 #!/bin/sh
-# call from top of CK folder
-EXE=./build/bin/tile_example_layernorm2d_fwd
+EXE="$(find . -name tile_example_layernorm2d_fwd -type f | head -n 1)"
 
 for fquant in "" "-fquant=1 -prec_o=int8"; do
 for pr_i in "fp16" "bf16" ; do
diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
index bb2c94901..34df7b74f 100644
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -69,7 +69,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using WarpTile   = ck_tile::sequence<1, 64>;
     using Vector     = ck_tile::sequence<1, 1>;
 
-    using Shape   = ck_tile::Rmsnorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
     using Problem = ck_tile::Rmsnorm2dFwdPipelineProblem<XDataType,
                                                          GammaDataType,
                                                          ComputeDataType,
diff --git a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
index f9cfe72de..b8697183f 100644
--- a/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
+++ b/example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
@@ -28,7 +28,6 @@ float rmsnorm2d_fwd_b16_(rmsnorm2d_fwd_traits /*t*/,
                          rmsnorm2d_fwd_args a,
                          const ck_tile::stream_config& s)
 {
-#if 1
     float r = -1;
     // clang-format off
     //                                            rm  rn  tm   tn  vn  pd    rms     2p
@@ -128,16 +127,12 @@ float rmsnorm2d_fwd_b16_(rmsnorm2d_fwd_traits /*t*/,
             r = rmsnorm2d_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  false, true>>(s, a);
     }
     return r;
-#else
-    return rmsnorm2d_fwd_<trait_<data_type,  1, 1,  1,  256, 4,  true,  false, false>>(s, a);
-#endif
     // clang-format on
 }
 
 float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile::stream_config& s)
 {
 
-    float r = -1;
     if(t.data_type.compare("fp16") == 0)
     {
         return rmsnorm2d_fwd_b16_<ck_tile::fp16_t>(t, a, s);
@@ -146,8 +141,6 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t, rmsnorm2d_fwd_args a, const ck_tile:
     {
         return rmsnorm2d_fwd_b16_<ck_tile::bf16_t>(t, a, s);
     }
-    if(r < 0)
+    else
         throw std::runtime_error("Without supported instances!");
-
-    return r;
 }
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
index 756ecb2c4..b4d429d46 100644
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
@@ -97,7 +97,7 @@ struct rmsnorm2d_fwd_traits_
     using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
 
-    using Shape = ck_tile::Rmsnorm2dShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
 
     static constexpr bool kPadN       = kPadN_;
     static constexpr bool kSaveInvRms = kSaveInvRms_;
diff --git a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
index f3cfcc4b8..7b9d0820f 100755
--- a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
+++ b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
@@ -1,6 +1,5 @@
-
-# run from top of ck folder
-EXE=build/bin/tile_rmsnorm2d_fwd
+#!/bin/sh
+EXE="$(find . -name tile_rmsnorm2d_fwd -type f | head -n 1)"
 
 $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
diff --git a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
index 6ec5e846c..758d6de54 100755
--- a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
+++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
@@ -1,6 +1,5 @@
 #!/bin/sh
-# call from top of CK folder
-EXE=./build/bin/tile_rmsnorm2d_fwd
+EXE="$(find . -name tile_rmsnorm2d_fwd -type f | head -n 1)"
 
 for pr_i in "fp16" "bf16" ; do
 $EXE -prec=$pr_i -m=99  -n=13
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
index bf70d9d23..443b9b102 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
@@ -18,7 +18,7 @@ struct AddRmsnormRdquantTypeConfig<ck_tile::half_t>
     using BDataType       = ck_tile::half_t;
     using GammaDataType   = ck_tile::half_t;
     using XDataType       = ck_tile::half_t;
-    using YScaleDataType  = ck_tile::half_t;
+    using YScaleDataType  = float;
     using QYDataType      = ck_tile::int8_t;
     using ComputeDataType = float;
 };
@@ -30,7 +30,7 @@ struct AddRmsnormRdquantTypeConfig<ck_tile::bf16_t>
     using BDataType       = ck_tile::bf16_t;
     using GammaDataType   = ck_tile::bf16_t;
     using XDataType       = ck_tile::bf16_t;
-    using YScaleDataType  = ck_tile::bf16_t;
+    using YScaleDataType  = float;
     using QYDataType      = ck_tile::int8_t;
     using ComputeDataType = float;
 };
@@ -101,7 +101,7 @@ struct add_rmsnorm2d_rdquant_fwd_traits_
     using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
 
-    using Shape = ck_tile::AddRmsnorm2dRdquantShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
 
     static constexpr bool kPadN      = kPadN_;
     static constexpr bool kSaveX     = kSaveX_;
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
index 40fabf7f5..ada4c6f2d 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -66,7 +66,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using BDataType       = DataType;
     using GammaDataType   = DataType;
     using XDataType       = DataType;
-    using YScaleDataType  = DataType;
+    using YScaleDataType  = float;
     using QYDataType      = ck_tile::int8_t;
     using ComputeDataType = float;
 
@@ -99,12 +99,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     constexpr bool kThreePass = true;
 
-    using BlockWarps = ck_tile::sequence<2, 2>;
-    using BlockTile  = ck_tile::sequence<2, 128>;
+    using BlockWarps = ck_tile::sequence<4, 1>;
+    using BlockTile  = ck_tile::sequence<4, 128>;
     using WarpTile   = ck_tile::sequence<1, 64>;
     using Vector     = ck_tile::sequence<1, 1>;
 
-    using Shape   = ck_tile::AddRmsnorm2dRdquantShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
     using Problem = ck_tile::AddRmsnorm2dRdquantFwdPipelineProblem<ADataType,
                                                                    BDataType,
                                                                    GammaDataType,
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
index 57a0f254d..966c5bd02 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_api.cpp
@@ -28,7 +28,6 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits /*t*/,
                                      add_rmsnorm2d_rdquant_fwd_args a,
                                      const ck_tile::stream_config& s)
 {
-#if 1
     float r = -1;
     // clang-format off
     //                                                      rm  rn  tm   tn  vn   pd     x      3p
@@ -128,9 +127,6 @@ float add_rmsnorm2d_rdquant_fwd_b16_(add_rmsnorm2d_rdquant_fwd_traits /*t*/,
             r = add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 4, 1, 1024, 1,  true,  true, true>>(s, a);
     }
     return r;
-#else
-    return add_rmsnorm2d_rdquant_fwd_<trait_<data_type,  1, 1, 2,  128, 8,  true,  true, false>>(s, a);
-#endif
     // clang-format on
 }
 
@@ -139,7 +135,6 @@ float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t,
                                 const ck_tile::stream_config& s)
 {
 
-    float r = -1;
     // Only support instance of save_x == true for now
     assert(t.save_x);
     if(t.data_type.compare("fp16") == 0)
@@ -150,8 +145,6 @@ float add_rmsnorm2d_rdquant_fwd(add_rmsnorm2d_rdquant_fwd_traits t,
     {
         return add_rmsnorm2d_rdquant_fwd_b16_<ck_tile::bf16_t>(t, a, s);
     }
-    if(r < 0)
+    else
         throw std::runtime_error("Without supported instances!");
-
-    return r;
 }
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
index 11fd36488..d02b0bab3 100755
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/perf_test.sh
@@ -1,6 +1,5 @@
-
-# run from top of ck folder
-EXE=build/bin/tile_add_rmsnorm2d_rdquant_fwd
+#!/bin/sh
+EXE="$(find . -name tile_add_rmsnorm2d_rdquant_fwd -type f | head -n 1)"
 
 $EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
 $EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
index 4a02cdcb6..b60f5fcf2 100755
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/script/smoke_test.sh
@@ -1,6 +1,5 @@
 #!/bin/sh
-# call from top of CK folder
-EXE=./build/bin/tile_add_rmsnorm2d_rdquant_fwd
+EXE="$(find . -name tile_add_rmsnorm2d_rdquant_fwd -type f | head -n 1)"
 
 for pr_i in "fp16" "bf16" ; do
 $EXE -prec=$pr_i -m=99  -n=13
diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt
new file mode 100644
index 000000000..09a56c6da
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/CMakeLists.txt
@@ -0,0 +1,24 @@
+function (add_smoothquant_example TARGET_NAME MAIN_SRC)
+    message("adding ${TARGET_NAME}")
+    # not using add_example_executable() to add target, since we don't want this to have
+    # to be included in "make all/install/check"
+    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
+    target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+    foreach(source IN LISTS ARGN)
+        list(APPEND INSTANCE_SRCS ${source})
+    endforeach()
+
+    target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS})
+
+    set(COMPILE_OPTIONS)
+    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+    list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+
+    target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS})
+endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC)
+
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+
+add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS})
+add_smoothquant_example(tile_example_smoothquant example_smoothquant.cpp)
diff --git a/example/ck_tile/12_smoothquant/README.md b/example/ck_tile/12_smoothquant/README.md
new file mode 100644
index 000000000..d6b815f8c
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/README.md
@@ -0,0 +1,21 @@
+# smoothquant
+
+This folder contains example for smoothquant using ck_tile tile-programming implementation.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_smoothquant -j
+```
+This will result in an executable `build/bin/tile_smoothquant`
+
+## cmdline
+```
+args:
+          -m    m dimension (default:3328)
+          -n    m dimension (default:4096)
+          -v    cpu validation or not (default:1)
+       -prec    precision (default:fp16)
+```
diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
new file mode 100644
index 000000000..3a26eb6a7
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -0,0 +1,237 @@
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/smoothquant.hpp"
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using XDataType       = DataType;
+    using XScaleDataType  = float;
+    using YScaleDataType  = float;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XScaleDataType> xscale_host({n});
+
+    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
+
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    xscale_buf.ToDevice(xscale_host.data());
+
+    constexpr bool kTwoPass = true;
+
+    using BlockWarps = ck_tile::sequence<2, 2>;
+    using BlockTile  = ck_tile::sequence<2, 128>;
+    using WarpTile   = ck_tile::sequence<1, 64>;
+    using Vector     = ck_tile::sequence<1, 1>;
+
+    using Shape   = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Problem = ck_tile::SmoothquantPipelineProblem<XDataType,
+                                                        XScaleDataType,
+                                                        ComputeDataType,
+                                                        YScaleDataType,
+                                                        QYDataType,
+                                                        Shape,
+                                                        true,
+                                                        kTwoPass>;
+
+    using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass<Problem>;
+    using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass<Problem>;
+    using Pipeline        = std::conditional_t<kTwoPass, TwoPassPipeline, OnePassPipeline>;
+    using Kernel          = ck_tile::Smoothquant<Pipeline>;
+
+    ck_tile::SmoothquantHostArgs args{x_buf.GetDeviceBuffer(),
+                                      xscale_buf.GetDeviceBuffer(),
+                                      yscale_buf.GetDeviceBuffer(),
+                                      qy_buf.GetDeviceBuffer(),
+                                      m,
+                                      n,
+                                      stride};
+
+    auto kargs = Kernel::MakeKargs(args);
+
+    const dim3 grids                       = Kernel::GridSize(args);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    auto s = ck_tile::stream_config{nullptr, true, 1, warmup, repeat};
+
+    ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        using YDataType = ComputeDataType;
+        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {stride, 1});
+        // smooth outlier
+        {
+            auto f = [&](auto n_) {
+                auto v_xscale = ck_tile::type_convert<ComputeDataType>(xscale_host(n_));
+
+                for(int m_ = 0; m_ < m; ++m_)
+                {
+                    auto v_x       = ck_tile::type_convert<ComputeDataType>(x_host(m_, n_));
+                    y_host(m_, n_) = v_x * v_xscale;
+                }
+            };
+
+            ck_tile::make_ParallelTensorFunctor(f, xscale_host.get_element_space_size())(
+                std::thread::hardware_concurrency());
+        }
+
+        // yscale
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(qy_host_dev,
+                                          qy_host_ref,
+                                          std::string("qy Error: Incorrect results!"),
+                                          rtol,
+                                          atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << "[" << data_type << "]"
+                  << " m:" << m << ", n:" << n << ", stride:" << stride
+                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    /*else if(data_type == "bf16")
+    {
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    }*/
+
+    return -3;
+}
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
new file mode 100644
index 000000000..b25361da2
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm   tn  vn   pd   2p
+#if 0
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
+
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
+#endif
+
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp
new file mode 100644
index 000000000..0a332fe41
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm  tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 4,  64, 8, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 2, 128, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 1, 256, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 6, 1, 256, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp
new file mode 100644
index 000000000..bdf5804e4
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm tn   vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 1, 256, 8, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 256, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 256, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 8, 1, 256, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp
new file mode 100644
index 000000000..774c977f2
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn tm  tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4, 64, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp
new file mode 100644
index 000000000..c571ef443
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp
new file mode 100644
index 000000000..80e4b3a29
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..7f776a6e4
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp
new file mode 100644
index 000000000..12bc90b66
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 1, 4, 64, 8, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 2, 4, 64, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 4, 4, 64, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 8, 4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp
new file mode 100644
index 000000000..1cee18606
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn   pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 1,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 2,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4, 64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp
new file mode 100644
index 000000000..aca7f7eb4
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  3, 4, 64, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1,  6, 4, 64, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::bf16_t, 1, 12, 4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
new file mode 100644
index 000000000..be5fecaca
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm    tn  vn   pd    2p
+#if 0
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
+
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
+#endif
+
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp
new file mode 100644
index 000000000..59fe14875
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm   tn  vn   pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 4,   64, 8,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 2,  128, 4,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 1,  256, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 6, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp
new file mode 100644
index 000000000..a3710a6ab
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm  tn  vn   pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 1,  256, 8, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 2, 1,  256, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 4, 1,  256, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 8, 1,  256, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp
new file mode 100644
index 000000000..2b1bca7aa
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4, 64, 4, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4, 64, 2, true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp
new file mode 100644
index 000000000..205ba130e
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn  vn  pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp
new file mode 100644
index 000000000..96503ac91
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn  vn    pd     2p
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..36e5e0bb1
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true, true>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp
new file mode 100644
index 000000000..f09932e29
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn   pd    2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4, 64, 8,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4, 64, 4,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4, 64, 2,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4, 64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp
new file mode 100644
index 000000000..023cd0be6
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn  tm  tn  vn  pd      2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true, false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp
new file mode 100644
index 000000000..5dcf560c7
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn  tm  tn  vn  pd       2p
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , false>>(const S&, A);
+template float smoothquant_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp
new file mode 100644
index 000000000..962755f6e
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "smoothquant.hpp"
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+using trait_ = smoothquant_traits_<DataType_,
+                                   Repeat_M_,
+                                   Repeat_N_,
+                                   ThreadPerBlock_M_,
+                                   ThreadPerBlock_N_,
+                                   Vector_N_,
+                                   kPadN_,
+                                   kTwoPass_>;
+
+template <typename data_type>
+float smoothquant_dispatch(smoothquant_traits /*t*/,
+                           smoothquant_args a,
+                           const ck_tile::stream_config& s)
+{
+    float r = -1;
+    // clang-format off
+    //                                         rm  rn  tm  tn  vn   pd    2p
+    if(a.n <= 64) {
+            r = smoothquant_<trait_<data_type, 1,  1,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 128) {
+        if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type, 1,  1,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type, 1,  2,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 256) {
+        if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 1,  4,  64, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 4,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 512) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 1,  4,  64, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2,  4,  64, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 4,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 8,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 768) {
+        if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3,  4,  64, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 6,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1,12,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 1024) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 1, 2,  128, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 2,  128, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 4, 2,  128, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 4, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 1536) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3, 4,   64, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3, 2,  128, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 6, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 2048) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 1, 1,  256, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 1,  256, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 4, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 8, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 3072) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3, 1,  128, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 3, 1,  256, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 6, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 3, 1, 1024, 1,  true, false>>(s, a);
+    }
+    else if(a.n <= 4096) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 1,  256, 8,  true, false>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 4, 1,  256, 4,  true, false>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 1, 1024, 2,  true, false>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 4, 1, 1024, 1,  true, false>>(s, a);
+    }
+    else if(a.n > 4096) {
+        if (a.n % 8 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 1,  256, 8,  true, true>>(s, a);
+        else if (a.n % 4 == 0)
+            r = smoothquant_<trait_<data_type,  1, 4, 1,  256, 4,  true, true>>(s, a);
+        else if (a.n % 2 == 0)
+            r = smoothquant_<trait_<data_type,  1, 2, 1, 1024, 2,  true, true>>(s, a);
+        else
+            r = smoothquant_<trait_<data_type,  1, 4, 1, 1024, 1,  true, true>>(s, a);
+    }
+    return r;
+    // clang-format on
+}
+
+float smoothquant(smoothquant_traits t, smoothquant_args a, const ck_tile::stream_config& s)
+{
+    if(t.data_type.compare("fp16") == 0)
+    {
+        return smoothquant_dispatch<ck_tile::fp16_t>(t, a, s);
+    }
+    else if(t.data_type.compare("bf16") == 0)
+    {
+        return smoothquant_dispatch<ck_tile::bf16_t>(t, a, s);
+    }
+    else
+        throw std::runtime_error("Without supported instances!");
+}
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
new file mode 100644
index 000000000..cdf93f6fc
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "smoothquant.hpp"
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = smoothquant_args;
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+using trait_ = smoothquant_traits_<DataType_,
+                                   Repeat_M_,
+                                   Repeat_N_,
+                                   ThreadPerBlock_M_,
+                                   ThreadPerBlock_N_,
+                                   Vector_N_,
+                                   kPadN_,
+                                   kTwoPass_>;
+
+template <typename Traits_>
+float smoothquant_(const S& s, A a)
+{
+    using DataType = typename Traits_::DataType;
+
+    using PipelineProblem = ck_tile::SmoothquantPipelineProblem<
+        typename SmoothquantTypeConfig<DataType>::XDataType,
+        typename SmoothquantTypeConfig<DataType>::XScaleDataType,
+        typename SmoothquantTypeConfig<DataType>::ComputeDataType,
+        typename SmoothquantTypeConfig<DataType>::YScaleDataType,
+        typename SmoothquantTypeConfig<DataType>::QYDataType,
+        typename Traits_::Shape,
+        Traits_::kPadN,
+        Traits_::kTwoPass>;
+
+    using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Kernel = ck_tile::Smoothquant<Pipeline>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/12_smoothquant/script/perf_test.sh b/example/ck_tile/12_smoothquant/script/perf_test.sh
new file mode 100755
index 000000000..741eb32ec
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/script/perf_test.sh
@@ -0,0 +1,37 @@
+
+EXE="$(find . -name tile_smoothquant -type f | head -n 1)"
+
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/12_smoothquant/script/smoke_test.sh b/example/ck_tile/12_smoothquant/script/smoke_test.sh
new file mode 100755
index 000000000..d08e06396
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/script/smoke_test.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+EXE="$(find . -name tile_smoothquant -type f | head -n 1)"
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -prec=$pr_i -m=99  -n=13
+$EXE -prec=$pr_i -m=17  -n=16
+$EXE -prec=$pr_i -m=1   -n=100
+$EXE -prec=$pr_i -m=4   -n=128
+$EXE -prec=$pr_i -m=80  -n=127
+$EXE -prec=$pr_i -m=22  -n=255 -stride=256
+$EXE -prec=$pr_i -m=7   -n=599
+$EXE -prec=$pr_i -m=19  -n=512
+$EXE -prec=$pr_i -m=33  -n=313 -stride=1000
+$EXE -prec=$pr_i -m=11  -n=510
+$EXE -prec=$pr_i -m=171 -n=676 -stride=818
+$EXE -prec=$pr_i -m=91  -n=636
+$EXE -prec=$pr_i -m=12  -n=768 -stride=800
+$EXE -prec=$pr_i -m=100 -n=766 -stride=812
+$EXE -prec=$pr_i -m=31  -n=1024
+$EXE -prec=$pr_i -m=64  -n=1000 -stride=1004
+$EXE -prec=$pr_i -m=8   -n=1501
+$EXE -prec=$pr_i -m=3   -n=1826
+$EXE -prec=$pr_i -m=5   -n=2040
+$EXE -prec=$pr_i -m=7   -n=2734
+$EXE -prec=$pr_i -m=1   -n=3182
+$EXE -prec=$pr_i -m=9   -n=4096
+$EXE -prec=$pr_i -m=3   -n=8192
+$EXE -prec=$pr_i -m=1   -n=10547
+$EXE -prec=$pr_i -m=3   -n=17134
+done
diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp
new file mode 100644
index 000000000..ed01d654f
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/smoothquant.cpp
@@ -0,0 +1,218 @@
+#include "ck_tile/host.hpp"
+#include "smoothquant.hpp"
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m      = arg_parser.get_int("m");
+    ck_tile::index_t n      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = n;
+    std::string data_type = arg_parser.get_str("prec");
+    int kname             = arg_parser.get_int("kname");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= n);
+
+    using TypeConfig = SmoothquantTypeConfig<DataType>;
+
+    using XDataType       = typename TypeConfig::XDataType;
+    using XScaleDataType  = typename TypeConfig::XScaleDataType;
+    using YScaleDataType  = typename TypeConfig::YScaleDataType;
+    using QYDataType      = typename TypeConfig::QYDataType;
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XScaleDataType> xscale_host({n});
+
+    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
+
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    xscale_buf.ToDevice(xscale_host.data());
+
+    std::cout << "[" << data_type << "]"
+              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+
+    smoothquant_traits traits{data_type};
+
+    smoothquant_args args{x_buf.GetDeviceBuffer(),
+                          xscale_buf.GetDeviceBuffer(),
+                          yscale_buf.GetDeviceBuffer(),
+                          qy_buf.GetDeviceBuffer(),
+                          m,
+                          n,
+                          stride};
+
+    float ave_time = smoothquant(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XScaleDataType) * n +
+                           sizeof(YScaleDataType) * m + sizeof(QYDataType) * m * n;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        using YDataType = ComputeDataType;
+        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {stride, 1});
+        // smooth outlier
+        {
+            auto f = [&](auto n_) {
+                auto v_xscale = ck_tile::type_convert<ComputeDataType>(xscale_host(n_));
+
+                for(int m_ = 0; m_ < m; ++m_)
+                {
+                    auto v_x       = ck_tile::type_convert<ComputeDataType>(x_host(m_, n_));
+                    y_host(m_, n_) = v_x * v_xscale;
+                }
+            };
+
+            ck_tile::make_ParallelTensorFunctor(f, xscale_host.get_element_space_size())(
+                std::thread::hardware_concurrency());
+        }
+
+        // yscale
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+
+            if(stride == n)
+            {
+                pass = ck_tile::check_err(qy_host_dev,
+                                          qy_host_ref,
+                                          std::string("qy Error: Incorrect results!"),
+                                          rtol,
+                                          atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride + n);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16")
+    {
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/12_smoothquant/smoothquant.hpp b/example/ck_tile/12_smoothquant/smoothquant.hpp
new file mode 100644
index 000000000..26a598db5
--- /dev/null
+++ b/example/ck_tile/12_smoothquant/smoothquant.hpp
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/smoothquant.hpp"
+#include <string>
+
+template <typename DataType>
+struct SmoothquantTypeConfig;
+
+template <>
+struct SmoothquantTypeConfig<ck_tile::half_t>
+{
+    using XDataType       = ck_tile::half_t;
+    using XScaleDataType  = float;
+    using YScaleDataType  = float;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+template <>
+struct SmoothquantTypeConfig<ck_tile::bf16_t>
+{
+    using XDataType       = ck_tile::bf16_t;
+    using XScaleDataType  = float;
+    using YScaleDataType  = float;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+// runtime args
+struct smoothquant_args : public ck_tile::SmoothquantHostArgs
+{
+};
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+struct smoothquant_traits_
+{
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN    = kPadN_;
+    static constexpr bool kTwoPass = kTwoPass_;
+};
+
+template <typename Traits_>
+float smoothquant_(const ck_tile::stream_config& s, smoothquant_args a);
+
+// This is the public API, will be generated by script
+struct smoothquant_traits
+{
+    std::string data_type;
+};
+
+float smoothquant(smoothquant_traits, smoothquant_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index e404e5019..9dd9a6ca3 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -11,3 +11,4 @@ add_subdirectory(06_permute)
 add_subdirectory(09_topk_softmax)
 add_subdirectory(10_rmsnorm2d)
 add_subdirectory(11_add_rmsnorm2d_rdquant)
+add_subdirectory(12_smoothquant)
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
index fb8d7221b..d06d8529a 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -4,7 +4,6 @@
 #pragma once
 
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp"
-#include "ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
index 4a0e29035..f06910db3 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
@@ -9,15 +9,16 @@
 namespace ck_tile {
 
 // host side args
+// X = A + B, Y = Rmsnorm2d(X), QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
 struct AddRmsnorm2dRdquantFwdHostArgs
 {
-    const void* p_a;
-    const void* p_b;
-    const void* p_gamma;
+    const void* p_a;     // [m ,n], input, fp16/bf16
+    const void* p_b;     // [m ,n], input, fp16/bf16
+    const void* p_gamma; // [1, n], gamma, prec same as input
 
-    void* p_x;
-    void* p_yscale;
-    void* p_qy;
+    void* p_x;      // [m, n], output, p_a + p_b, fp16/bf16
+    void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of reuslt of rmsnorm2d(x)
+    void* p_qy;     // [m, n], output, result of quant tensor of rmsnorm2d(x) int8
 
     float epsilon;
 
@@ -90,7 +91,7 @@ struct AddRmsnorm2dRdquantFwd
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
     {
-        return integer_divide_ceil(hargs.m, Block_M);
+        return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
     CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
@@ -170,7 +171,7 @@ struct AddRmsnorm2dRdquantFwd
                 number<1>{});
 
             const auto tmp2_ =
-                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
 
             return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
         }();
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
deleted file mode 100644
index 4bc7db434..000000000
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_shape.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-/*
-// clang-format off
-
-4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
-
-                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
-        +<----------------------< Repeat_N(2)>--------------------->+
-        |                                                           |
-        +<--    <WarpPerBlock_N(2)>  -->+
-            Warp_N
-        +--------------+--------------+--------------+--------------+----+----------------+
- Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
-        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
-        | wrap_2       | wrap_3       |                             |    v
-        +--------------+--------------+--------------+--------------+----+           Block_M
-        |                             |                             |
-        +                             +                             |
-        |                             |                             |                     v
-        +--------------+--------------+--------------+--------------+                     +
-
-        each Warp-tile (e.g 16 thrd per row)
-
-         Vector_N (contiguous pixels each thrd holds along N, or vector size)
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
-        +-----------+-----------+-----------+-----------+-----------+
-// clang-format on
-*/
-template <typename BlockTile_,    // block size, seq<M, N>
-          typename WarpPerBlock_, // num warps along seq<M, N>
-          typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
-          index_t BlockSize_ =
-              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
-struct AddRmsnorm2dRdquantShape
-{
-    // block size
-    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
-
-    // num warps along seq<M, N>, within each block
-    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
-
-    // warp size
-    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
-
-    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
-    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
-    // repeat of each thread along seq<M, N>
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    // vector size along seq<M, N>
-    static constexpr index_t Vector_M = Vector_::at(number<0>{});
-    static constexpr index_t Vector_N = Vector_::at(number<1>{});
-
-    static_assert(Warp_M % Vector_M == 0);
-    static_assert(Warp_N % Vector_N == 0);
-    // num of threads along seq<M, N>, within each warp
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t BlockSize = BlockSize_;
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
index 73ba633b1..0b9bae4e9 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_default_policy.hpp
@@ -26,6 +26,7 @@ struct AddRmsnorm2dRdquantFwdPipelineDefaultPolicy
                 sequence<1, 1, 2, 2>,
                 sequence<0, 3, 0, 3>>{});
     }
+
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
     {
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index 9a2e06d05..f5a214ba5 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -117,7 +117,7 @@ struct Layernorm2dFwd
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
     {
-        return (hargs.m + Block_M - 1) / Block_M;
+        return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
     CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
@@ -165,7 +165,7 @@ struct Layernorm2dFwd
             return base_str;
         }();
 
-        return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" + 
+        return _SS_("layernorm2d_fwd_") + _SS_(prec_str) + "_" +
              _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
              _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
              _SS_(Pipeline::name) + surfix;
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
index 6661cddf4..02fd5f7b9 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
@@ -26,6 +26,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy
                 sequence<1, 1, 2, 2>,
                 sequence<0, 3, 0, 3>>{});
     }
+
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution()
     {
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
index 7ec830add..17ff80f47 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
index fb327f74a..ed9e18be3 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index 3c6814711..d6ca98e7b 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -29,7 +29,8 @@ struct BlockReduce2d
         sweep_tile<XDistributedTensor_>(
             [&](auto... idx_) {
                 constexpr auto idx_0 = make_tuple(make_tuple(idx_[number<0>{}]...)[number<0>{}]);
-                y_tensor(idx_0)      = reduce_func(y_tensor(idx_0), x_tensor[idx_]...);
+                y_tensor(idx_0)      = reduce_func(
+                    y_tensor(idx_0), ck_tile::type_convert<ComputeDataType>(x_tensor[idx_])...);
             },
             ReducePacksPerXDim{});
 #if 0
diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp
index f0a6cf960..8d075dc5f 100644
--- a/include/ck_tile/ops/rmsnorm2d.hpp
+++ b/include/ck_tile/ops/rmsnorm2d.hpp
@@ -4,7 +4,6 @@
 #pragma once
 
 #include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp"
-#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp"
 #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp"
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
index 99084a25e..fd89cc36c 100644
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
@@ -11,11 +11,11 @@ namespace ck_tile {
 // host side args
 struct Rmsnorm2dFwdHostArgs
 {
-    const void* p_x;
-    const void* p_gamma;
+    const void* p_x;     // [m ,n], input, fp16/bf16
+    const void* p_gamma; // [1, n], gamma, prec same as input
 
-    void* p_y;
-    void* p_invRms;
+    void* p_y;      // [m, n], output, fp16/bf16
+    void* p_invRms; // [m, 1], output inv-rms, prec same as input, nullptr if not used
 
     float epsilon;
 
@@ -83,7 +83,7 @@ struct Rmsnorm2dFwd
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
     {
-        return (hargs.m + Block_M - 1) / Block_M;
+        return dim3(integer_divide_ceil(hargs.m, Block_M));
     }
 
     CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
@@ -149,7 +149,7 @@ struct Rmsnorm2dFwd
                 number<1>{});
 
             const auto tmp2_ =
-                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
 
             return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
         }();
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
deleted file mode 100644
index fc4b9f470..000000000
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-/*
-// clang-format off
-
-4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector
-
-                         Block_N (Warp_N * WarpPerBlock_N * Repeat_N )
-        +<----------------------< Repeat_N(2)>--------------------->+
-        |                                                           |
-        +<--    <WarpPerBlock_N(2)>  -->+
-            Warp_N
-        +--------------+--------------+--------------+--------------+----+----------------+
- Warp_M | wrap_0       | wrap_1       |                             |    ^                ^
-        +--------------+--------------+                             |   <WarpPerBlock_M(2)> |
-        | wrap_2       | wrap_3       |                             |    v
-        +--------------+--------------+--------------+--------------+----+           Block_M
-        |                             |                             |
-        +                             +                             |
-        |                             |                             |                     v
-        +--------------+--------------+--------------+--------------+                     +
-
-        each Warp-tile (e.g 16 thrd per row)
-
-         Vector_N (contiguous pixels each thrd holds along N, or vector size)
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_0    | thrd_1    | thrd_2    | thrd_3    | ...         Vector_M
-        +-----------+-----------+-----------+-----------+-----------+
-        | thrd_16   | thrd_17   | thrd_18   | thrd_19   | ...
-        +-----------+-----------+-----------+-----------+-----------+
-// clang-format on
-*/
-template <typename BlockTile_,    // block size, seq<M, N>
-          typename WarpPerBlock_, // num warps along seq<M, N>
-          typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
-          index_t BlockSize_ =
-              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
-struct Rmsnorm2dShape
-{
-    // block size
-    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile_::at(number<1>{});
-
-    // num warps along seq<M, N>, within each block
-    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
-
-    // warp size
-    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
-
-    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
-    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
-    // repeat of each thread along seq<M, N>
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    // vector size along seq<M, N>
-    static constexpr index_t Vector_M = Vector_::at(number<0>{});
-    static constexpr index_t Vector_N = Vector_::at(number<1>{});
-
-    static_assert(Warp_M % Vector_M == 0);
-    static_assert(Warp_N % Vector_N == 0);
-    // num of threads along seq<M, N>, within each warp
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t BlockSize = BlockSize_;
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
index e4814cf45..b258dcbae 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp
@@ -26,6 +26,7 @@ struct Rmsnorm2dFwdPipelineDefaultPolicy
                 sequence<1, 1, 2, 2>,
                 sequence<0, 3, 0, 3>>{});
     }
+
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
     {
diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp
new file mode 100644
index 000000000..c9e459765
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant.hpp
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp"
+#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp"
+#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp"
+#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp"
+#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
new file mode 100644
index 000000000..6ec333516
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// host side args
+struct SmoothquantHostArgs
+{
+    const void* p_x;      // [m ,n], input, fp16/bf16
+    const void* p_xscale; // [1, n], input, columnwise scale, fp32
+
+    void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_xscale)
+    void* p_qy;     // [m, n], output, p_x * p_xscale / p_yscale
+
+    index_t m;
+    index_t n;
+    index_t stride; // row_stride
+};
+
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct Smoothquant
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using XScaleDataType  = remove_cvref_t<typename Problem::XScaleDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using YScaleDataType  = remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kTwoPass   = Problem::kTwoPass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    struct Kargs
+    {
+        const void* p_x;
+        const void* p_xscale;
+
+        void* p_yscale;
+        void* p_qy;
+
+        index_t m;
+        index_t n;
+        index_t stride; // row_stride
+    };
+    using Hargs = SmoothquantHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        return Kargs{
+            hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        return dim3(integer_divide_ceil(hargs.m, Block_M));
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kTwoPass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("smoothquant_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const auto iM = get_block_id() * Block_M;
+
+        const auto x_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XDataType*>(kargs.p_x),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        const auto xscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XScaleDataType*>(kargs.p_xscale),
+                make_tuple(kargs.n),
+                make_tuple(1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
+        }();
+
+        auto yscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<YScaleDataType*>(kargs.p_yscale),
+                make_tuple(kargs.m),
+                make_tuple(1),
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {iM});
+        }();
+
+        auto qy_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<QYDataType*>(kargs.p_qy),
+                make_tuple(kargs.m, kargs.n),
+                make_tuple(kargs.stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+        }();
+
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.n, smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp
new file mode 100644
index 000000000..ff81e69f0
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+
+namespace ck_tile {
+
+struct SmoothquantPipelineDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXScaleBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<0, 1>, sequence<1, 2>>,
+                sequence<1, 1>,
+                sequence<0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockReduce2dProblem<typename Problem::XDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::BlockShape>;
+
+            using block_reduce2d = BlockReduce2d<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
+                    MakeXBlockTileDistribution<Problem>()));
+            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
+
+            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
new file mode 100644
index 000000000..d5b3780de
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
+struct SmoothquantPipelineOnePass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using XScaleDataType  = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using QYDataType      = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
+    using YScaleDataType  = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_op"; // block per row
+        else
+            return "wpr_op"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const XScaleWindow& xscale_window_,
+                                   YScaleWindow& yscale_window,
+                                   QYWindow& qy_window,
+                                   ck_tile::index_t,
+                                   void* smem) const
+    {
+        auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto xscale_window = make_tile_window(
+            xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
+
+        auto reduce_absmax_func  = ReduceOp::AbsMax{};
+        auto reduce_max_func     = ReduceOp::Max{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        const auto x      = load_tile(x_window);
+        const auto xscale = load_tile(xscale_window);
+        auto y            = tile_elementwise_in(
+            [&](const auto& a, const auto& b) {
+                return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
+            },
+            x,
+            xscale);
+
+        // compute absmax, cross-lane->cross-warp
+        auto absmax = block_reduce2d(
+            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        block_reduce2d_sync(absmax, reduce_max_func);
+        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
+
+        // ex: yscale = absmax / 127 if int8
+        auto yscale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
+            },
+            absmax);
+        store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
+
+        // quantize y to qy
+        auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
+        sweep_tile(qy, [&](auto idx) {
+            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+            auto qy_             = y[idx] / yscale[i_idx];
+            qy(idx)              = saturates<QYDataType>{}(qy_);
+        });
+        store_tile(qy_window, qy);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp
new file mode 100644
index 000000000..37e09b58c
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+// Y = X * XScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
+template <typename XDataType_,
+          typename XScaleDataType_,
+          typename ComputeDataType_,
+          typename YScaleDataType_,
+          typename QYDataType_,
+          typename BlockShape_,
+          bool kPadN_,
+          bool kTwoPass_>
+struct SmoothquantPipelineProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using XScaleDataType  = remove_cvref_t<XScaleDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YScaleDataType  = remove_cvref_t<YScaleDataType_>;
+    using QYDataType      = remove_cvref_t<QYDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+
+    static constexpr bool kPadN    = kPadN_;
+    static constexpr bool kTwoPass = kTwoPass_;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
new file mode 100644
index 000000000..7878ef1d3
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
+struct SmoothquantPipelineTwoPass
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using XScaleDataType  = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using QYDataType      = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
+    using YScaleDataType  = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
+
+    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
+    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
+    static constexpr bool kPadN              = Problem::kPadN;
+
+    static constexpr const char* name = []() {
+        if constexpr(kNeedCrossWarpSync)
+            return "bpr_tp"; // block per row
+        else
+            return "wpr_tp"; // warp per row
+    }();
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
+    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
+                                   const XScaleWindow& xscale_window_,
+                                   YScaleWindow& yscale_window,
+                                   QYWindow& qy_window,
+                                   ck_tile::index_t row_size,
+                                   void* smem) const
+    {
+        auto x_window =
+            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto xscale_window = make_tile_window(
+            xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
+
+        static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+
+        auto reduce_absmax_func  = ReduceOp::AbsMax{};
+        auto reduce_max_func     = ReduceOp::Max{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(x_window)));
+        auto absmax       = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x      = load_tile(x_window);
+            const auto xscale = load_tile(xscale_window);
+            const auto y      = tile_elementwise_in(
+                [&](const auto& a, const auto& b) {
+                    return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
+                },
+                x,
+                xscale);
+
+            block_reduce2d(y, absmax, reduce_absmax_func);
+
+            move_tile_window(x_window, {0, Block_N});
+            move_tile_window(xscale_window, {Block_N});
+        }
+
+        // compute absmax, cross-lane->cross-warp
+        block_reduce2d_sync(absmax, reduce_max_func);
+        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
+
+        // ex: yscale = absmax / 127 if int8
+        auto yscale = tile_elementwise_in(
+            [&](const auto& v_) {
+                return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
+            },
+            absmax);
+        store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
+
+        // reverse read x to reuse cache
+        ck_tile::index_t stride_to_right_most_window =
+            row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
+
+        move_tile_window(x_window, {0, -Block_N});
+        move_tile_window(xscale_window, {-Block_N});
+        move_tile_window(qy_window, {0, stride_to_right_most_window});
+
+        // recompute y and quantize y to qy
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x      = load_tile(x_window);
+            const auto xscale = load_tile(xscale_window);
+            const auto y      = tile_elementwise_in(
+                [&](const auto& a, const auto& b) {
+                    return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
+                },
+                x,
+                xscale);
+
+            auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
+            sweep_tile(qy, [&](auto idx) {
+                constexpr auto i_idx = make_tuple(idx[number<0>{}]);
+                auto qy_             = y[idx] / yscale[i_idx];
+                qy(idx)              = saturates<QYDataType>{}(qy_);
+            });
+            store_tile(qy_window, qy);
+
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(xscale_window, {0, -Block_N});
+            move_tile_window(qy_window, {0, -Block_N});
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py
index 0612d4238..b0d2c36ef 100644
--- a/include/ck_tile/remod.py
+++ b/include/ck_tile/remod.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import pathlib
 from pathlib import Path
 import subprocess
@@ -8,8 +9,8 @@ NS = 'ck_tile'
 OPS = 'ops'
 OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
 
-HEADER_COMMON = """// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
+HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-{datetime.now().year}, Advanced Micro Devices, Inc. All rights reserved.\n
 """
 
 # aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
-- 
GitLab


From 03c6448ba3c854195c61c817036b66af1fa0e844 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 31 Oct 2024 22:52:23 -0700
Subject: [PATCH 032/153] Reduce build time. (#1621)

* disable fp8 gemm_universal on gfx90a and gfx908 by default

* fix cmake syntax

* fix clang format

* add ifdefs in amd_xdlops

* disable fp8 gemm instances on gfx90a by default

* update readme
---
 CMakeLists.txt                                | 12 ++++++--
 README.md                                     | 14 +++++----
 .../gpu/CMakeLists.txt                        | 30 +++++++++++++++++--
 ...tiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp | 10 ++++---
 ...gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp |  5 ++--
 ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp | 10 ++++---
 profiler/src/profile_gemm_universal.cpp       |  8 ++++-
 .../test_gemm_universal_xdl.cpp               |  4 +--
 8 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a5180363..74628597a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ if(GPU_TARGETS)
 else()
     set(USER_GPU_TARGETS 0)
 endif()
-find_package(hip)
+find_package(hip REQUIRED)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
 math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
@@ -170,7 +170,10 @@ else()
         set(CK_GPU_TARGETS ${GPU_TARGETS})
     endif()
 endif()
-
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+    set(GPU_TARGETS "")
+endif()
 #make sure all the targets on the list are actually supported by the current compiler
 rocm_check_target_ids(SUPPORTED_GPU_TARGETS
         TARGETS ${CK_GPU_TARGETS})
@@ -187,6 +190,10 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1
     add_definitions(-DCK_USE_WMMA)
     set(CK_USE_WMMA "ON")
 endif()
+option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
+if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
+    add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+endif()
 
 # CK config file to record supported datatypes, etc.
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
@@ -314,7 +321,6 @@ link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})
 
 ## HIP
-find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
 # The variables set by find_package() can't be overwritten,
 # therefore let's use intermediate variables.
diff --git a/README.md b/README.md
index 053406515..302173dc1 100644
--- a/README.md
+++ b/README.md
@@ -137,12 +137,11 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
 
     You can find instructions for running ckProfiler in [profiler](/profiler).
 
-Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
+Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2Gb of RAM.
 Depending on the number of CPU cores and the amount of RAM on your system, you may want to
-limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
-
-By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
-crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
+limit the number of threads. For example, if you have a 128-core CPU and 128 Gb of RAM it's advisable to use `-j32`.
 
 Additional cmake flags can be used to significantly speed-up the build:
 
@@ -154,6 +153,11 @@ Additional cmake flags can be used to significantly speed-up the build:
   `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
   other platforms have faster instances, such as `xdl` or `wmma`, available.
 
+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
+  such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not
+  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+  architectures like the MI100/MI200 for the functional support only.
+
 ## Using sccache for building
 
 The default CK Docker images come with a pre-installed version of sccache, which supports clang
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index f82176ffc..6756c3351 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -67,6 +67,21 @@ function(add_instance_library INSTANCE_NAME)
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
+    # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
+    if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_multiply_multiply_xdl_f8")
+         message("removing gemm_multiply_multiply_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_")
+         message("removing gemm_universal_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
+    endif()
     #only continue if there are some source files left on the list
     if(ARGN)
         set(INST_OBJ)
@@ -74,11 +89,20 @@ function(add_instance_library INSTANCE_NAME)
             set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
             if(source MATCHES "_xdl")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
-            elseif(ARGN MATCHES "_wmma")
+            elseif(source MATCHES "_wmma")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
-            elseif(ARGN MATCHES "mha")
+            elseif(source MATCHES "mha")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
+            #only build the fp8 gemm instances for gfx908/90a if the build argument is set
+            if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+                if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                endif()
+                if(source MATCHES "gemm_multiply_multiply_f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                endif()
+            endif()
             set(offload_targets)
             foreach(target IN LISTS INST_TARGETS)
                     string(APPEND offload_targets "--offload-arch=${target} ")
@@ -108,7 +132,7 @@ function(add_instance_library INSTANCE_NAME)
 
         # flags to compress the library
         if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
-            message("Adding --offload-compress flag for ${INSTANCE_NAME}")
+            #message("Adding --offload-compress flag for ${INSTANCE_NAME}")
             target_compile_options(${INSTANCE_NAME} PRIVATE --offload-compress)
         endif()
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
index 8a24af1b8..b1b64ca85 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
@@ -36,12 +36,12 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -58,17 +58,18 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   128,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|       Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |                |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
@@ -90,6 +91,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std:
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    16,  256,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  256,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
index 3b930e989..658714d35 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -62,12 +62,12 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
@@ -90,6 +90,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    128, 16,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    128, 16,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    128,  8,   8,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index b621cad94..382ed5b5a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -35,12 +35,12 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -57,17 +57,18 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
@@ -97,6 +98,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
     // clang-format on
     >;
 } // namespace instance
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index f86dddc72..576bd009b 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -101,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-    using F8   = ck::f8_t;
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    using F8 = ck::f8_t;
+#endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -162,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
@@ -178,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
+#endif
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
@@ -194,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
     }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
     else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
@@ -202,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
     }
+#endif
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl.cpp
index 0d29c5fb7..23b5c74dd 100644
--- a/test/gemm_universal/test_gemm_universal_xdl.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl.cpp
@@ -56,7 +56,7 @@ class TestGemmUniversal_KM_NK
 using KernelTypes_MK_KN = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<      F16,       F16,             F16,     F16>,
-#if (defined CK_ENABLE_FP8)
+#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
     std::tuple<      F16,        F8,             F16,     F16>,
     std::tuple<       F8,       F16,             F16,     F16>,
     std::tuple<       F8,        F8,              F8,    BF16>,
@@ -66,7 +66,7 @@ using KernelTypes_MK_KN = ::testing::Types<
 using KernelTypes_MK_NK = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<      F16,       F16,             F16,     F16>,
-#if (defined CK_ENABLE_FP8)
+#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
     std::tuple<      F16,        F8,             F16,     F16>,
     std::tuple<       F8,       F16,             F16,     F16>,
     std::tuple<       F8,        F8,              F8,    BF16>,
-- 
GitLab


From cb6c5d39dcc76f370d06d0c4467a3650c8713c2b Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Sat, 2 Nov 2024 05:30:16 +0000
Subject: [PATCH 033/153] [CK_TILE] layernorm have more accurate residual
 (#1623)

* more accurate residual

* modify comment

* Fix literal case in README.md

---------

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 example/ck_tile/02_layernorm2d/README.md      |  4 +-
 example/ck_tile/02_layernorm2d/generate.py    |  5 +-
 .../ops/epilogue/dynamic_quant_epilogue.hpp   | 84 +++++++++++++++----
 ...ayernorm2d_fwd_pipeline_default_policy.hpp | 10 +--
 .../layernorm2d_fwd_pipeline_one_pass.hpp     | 34 +++-----
 .../layernorm2d_fwd_pipeline_two_pass.hpp     | 23 +++--
 6 files changed, 97 insertions(+), 63 deletions(-)

diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md
index 14c6fc0d6..3573d70cd 100644
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -69,7 +69,7 @@ args:
 ```
 
 ## limitations
-Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated. though our kernel template suppor this. (TBD: add some flag in generate.py) to generate those instance on demand. Beside, N>8192 case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet.
+Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by default generated. Though our kernel template suppor this. (TBD: add some flag in generate.py) to generate those instance on demand. Beside, `N>8192` case will by default using two-pass pipeline, and `-fquant=1/2` are not supported yet. If need suport `N>8192` and `fused+residual+store`, you can use this example together with `12_smoothquant`, to construct layernorm+residual, and smoothquant, 2 kernels for this purpose.
 
 ```
 # some case
@@ -82,4 +82,4 @@ Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by d
 # standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8
 ./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1
 
-```
\ No newline at end of file
+```
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index 300f6c05e..bf576db97 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -202,8 +202,9 @@ float layernorm2d_fwd_(const S& s, A a)
     using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
     using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
 
-    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, YScaleDataType, YDataType, typename Traits_::Shape,
-            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, false,  true/*max3*/>>;
+    static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
+    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, XScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, false,  true/*max3*/>>;
 
     using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
 
diff --git a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
index 2e2960411..3dec404b4 100644
--- a/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp
@@ -8,17 +8,23 @@
 
 namespace ck_tile {
 
-template <bool kPadM_, bool kPadN_, bool UseRawStore_ = true, bool UseMax3_ = false>
+template <bool kPadM_,
+          bool kPadN_,
+          bool UseSmoothInputScale_,
+          bool UseRawStore_ = true,
+          bool UseMax3_     = false>
 struct DynamicQuantEpilogueTraits
 {
-    static constexpr bool kPadM       = kPadM_;
-    static constexpr bool kPadN       = kPadN_;
-    static constexpr bool UseRawStore = UseRawStore_;
-    static constexpr bool UseMax3     = UseMax3_;
+    static constexpr bool kPadM               = kPadM_;
+    static constexpr bool kPadN               = kPadN_;
+    static constexpr bool UseSmoothInputScale = UseSmoothInputScale_;
+    static constexpr bool UseRawStore         = UseRawStore_;
+    static constexpr bool UseMax3             = UseMax3_;
 };
 
 // this epilogue just store out a M*N matrix, row major
 template <typename AccDataType_,
+          typename XScaleDataType_,
           typename YScaleDataType_,
           typename ODataType_,
           typename BlockShape_,
@@ -26,17 +32,20 @@ template <typename AccDataType_,
 struct DynamicQuantEpilogueProblem
 {
     using AccDataType    = remove_cvref_t<AccDataType_>;
+    using XScaleDataType = remove_cvref_t<XScaleDataType_>;
     using YScaleDataType = remove_cvref_t<YScaleDataType_>;
     using ODataType      = remove_cvref_t<ODataType_>;
     using BlockShape     = remove_cvref_t<BlockShape_>; // can consum generic 2d shape
     using Traits         = remove_cvref_t<Traits_>;
 };
 
+// TODO: we should put descriptor creation function into policy
 template <typename Problem_, typename Policy_ = void>
 struct DynamicQuantEpilogue
 {
     using Problem                     = remove_cvref_t<Problem_>;
     using AccDataType                 = remove_cvref_t<typename Problem::AccDataType>;
+    using XScaleDataType              = remove_cvref_t<typename Problem::XScaleDataType>;
     using YScaleDataType              = remove_cvref_t<typename Problem::YScaleDataType>;
     using ODataType                   = remove_cvref_t<typename Problem::ODataType>;
     using BlockShape                  = remove_cvref_t<typename Problem::BlockShape>;
@@ -63,6 +72,33 @@ struct DynamicQuantEpilogue
         return BlockReduce2dCrossWarpSync<P_>{};
     }
 
+    CK_TILE_DEVICE static constexpr auto MakeSmoothInputScaleTileDistribution()
+    {
+        using S = BlockShape;
+#if 0
+        // don't remove this
+        // Note that if we set encoding purposely like this, you will result in compile fail
+        // TODO: x_scale create local-scratch to accept arbitrary acc input (with same length)
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<0, 1, 1>,
+                sequence<0, 0, 3>>{});
+#else
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
+                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<0, 1>, sequence<0, 1>>,
+                tuple<sequence<0, 1>, sequence<1, 2>>,
+                sequence<1, 1>,
+                sequence<0, 3>>{});
+#endif
+    }
+
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
@@ -71,8 +107,12 @@ struct DynamicQuantEpilogue
 
     // TODO: this function assume store out vector size is the same as OAccTile last dimension size
     //       how do we fix this ?
-    template <typename ODramWindowTmp, typename YScaleWindow, typename OAccTile>
+    template <typename ODramWindowTmp,
+              typename XScaleWindow,
+              typename YScaleWindow,
+              typename OAccTile>
     CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
+                                   const XScaleWindow& x_scale_window_,
                                    YScaleWindow& y_scale_window,
                                    const OAccTile& o_acc_tile,
                                    void* smem)
@@ -80,6 +120,18 @@ struct DynamicQuantEpilogue
         auto reduce                = GetBlockReduce2d();
         auto reduce_sync           = GetBlockReduce2dSync();
         auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
+        const auto x_scale_window =
+            make_tile_window(x_scale_window_, MakeSmoothInputScaleTileDistribution());
+
+        auto x_scale = load_tile(x_scale_window);
+
+        auto o_acc_tmp = o_acc_tile;
+
+        sweep_tile(o_acc_tmp, [&](auto idx) {
+            constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+            const auto xs_       = type_convert<AccDataType>(x_scale[j_idx]);
+            o_acc_tmp(idx)       = o_acc_tmp(idx) * xs_;
+        });
 
         const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); };
 
@@ -87,10 +139,9 @@ struct DynamicQuantEpilogue
             constexpr auto y_size_per_row =
                 OAccTile{}.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(
                     number<1>{});
-            // constexpr auto y_size_per_row = OAccTile::get_lengths()[number<1>{}];
             if constexpr(UseMax3 && std::is_same_v<AccDataType, float> && y_size_per_row % 2 == 0)
             {
-                // fast max3 implementation
+                // fast max3+abs implementation
                 const auto f_max3 = [](auto acc_, auto v_0_, auto v_1_) {
                     float rtn;
                     asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
@@ -98,11 +149,11 @@ struct DynamicQuantEpilogue
                                  : "v"(acc_), "v"(v_0_), "v"(v_1_));
                     return rtn;
                 };
-                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_max3, sequence<1, 2>{});
+                return reduce(o_acc_tmp, type_convert<AccDataType>(0), f_max3, sequence<1, 2>{});
             }
             else
             {
-                return reduce(o_acc_tile, type_convert<AccDataType>(0), f_absmax);
+                return reduce(o_acc_tmp, type_convert<AccDataType>(0), f_absmax);
             }
         }();
         reduce_sync(row_absmax, f_absmax);
@@ -117,23 +168,20 @@ struct DynamicQuantEpilogue
 
         store_tile(y_scale_window, cast_tile<YScaleDataType>(y_scale));
 
-        auto o_acc_scaled_tile =
-            make_static_distributed_tensor<AccDataType>(o_acc_tile.get_tile_distribution());
-
-        sweep_tile(o_acc_tile, [&](auto idx) {
-            constexpr auto row_id  = make_tuple(idx[number<0>{}]);
-            o_acc_scaled_tile(idx) = o_acc_tile[idx] / y_scale(row_id);
+        sweep_tile(o_acc_tmp, [&](auto idx) {
+            constexpr auto row_id = make_tuple(idx[number<0>{}]);
+            o_acc_tmp(idx)        = o_acc_tmp[idx] / y_scale(row_id);
         });
 
         // TODO: this is ugly
         if constexpr(UseRawStore && (kPadM || kPadN))
         {
-            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tmp));
             buffer_store_fence();
         }
         else
         {
-            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_scaled_tile));
+            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tmp));
         }
     }
 };
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
index 02fd5f7b9..1de230c14 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
@@ -45,7 +45,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford()
     {
-        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;
 
@@ -55,7 +55,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync()
     {
-        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;
 
@@ -65,7 +65,7 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync()
     {
-        using P_ = BlockWelfordProblem<typename Problem::XDataType,
+        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;
 
@@ -77,13 +77,13 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     {
         if constexpr(Problem::kNeedCrossWarpSync)
         {
-            using P_ = BlockWelfordProblem<typename Problem::XDataType,
+            using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                            typename Problem::ComputeDataType,
                                            typename Problem::BlockShape>;
 
             using block_welford = BlockWelford<P_>;
             using x_block_tile =
-                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
+                decltype(make_static_distributed_tensor<typename Problem::ComputeDataType>(
                     MakeXBlockTileDistribution<Problem>()));
             using mean_var_block_tile =
                 decltype(block_welford::template MakeMeanVarBlockTile<x_block_tile>());
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index 5601f3a68..83cdab428 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -87,12 +87,9 @@ struct Layernorm2dFwdPipelineOnePass
             x_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
         auto y_residual_window = make_tile_window(
             y_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
-        const auto x_scale_window = make_tile_window(
-            x_scale_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
 
-        auto x       = load_tile(x_window);
-        auto x_resi  = load_tile(x_residual_window);
-        auto x_scale = load_tile(x_scale_window);
+        auto x      = load_tile(x_window);
+        auto x_resi = load_tile(x_residual_window);
 
         int cur_count = 0;
         int max_count =
@@ -106,21 +103,21 @@ struct Layernorm2dFwdPipelineOnePass
         const auto gamma = load_tile(gamma_window);
         const auto beta  = load_tile(beta_window);
 
+        auto acc = cast_tile<ComputeDataType>(x);
+
         if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                      kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
         {
             sweep_tile(x_resi, [&](auto idx) {
                 // compute x = x_resi + x
-                auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
-                           type_convert<ComputeDataType>(x(idx));
-                x(idx) = type_convert<XDataType>(re_);
+                acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
             });
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
-                store_tile(y_residual_window, x);
+                store_tile(y_residual_window, cast_tile<YResidualDataType>(acc));
         }
 
         // compute welford each-thread->cross-lane->cross-warp
-        auto [mean, var] = block_welford(x, cur_count, max_count);
+        auto [mean, var] = block_welford(acc, cur_count, max_count);
         block_welford_sync(mean, var, cur_count);
         block_welford_cross_warp_sync(mean, var, cur_count, smem);
         block_tile_welford_post_scale_var(var, cur_count);
@@ -138,7 +135,7 @@ struct Layernorm2dFwdPipelineOnePass
             store_tile(inv_std_window, cast_tile<InvStdDataType>(inv_std));
 
         // layernorm computation
-        auto ln = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+        auto ln = make_static_distributed_tensor<ComputeDataType>(acc.get_tile_distribution());
         sweep_tile(ln, [&, mean_ = mean](auto idx) {
             constexpr auto i_idx = make_tuple(idx[number<0>{}]);
             constexpr auto j_idx = make_tuple(idx[number<1>{}]);
@@ -146,26 +143,15 @@ struct Layernorm2dFwdPipelineOnePass
             const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
             const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
 
-            const auto x_ = type_convert<ComputeDataType>(x[idx]);
-            auto ln_      = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+            auto ln_ = (acc[idx] - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
 
             ln(idx) = ln_;
         });
 
-        if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
-        {
-            // smooth-quant pre-scale, then run rowwise-quant
-            sweep_tile(ln, [&](auto idx) {
-                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
-                const auto xs_       = type_convert<ComputeDataType>(x_scale[j_idx]);
-                ln(idx)              = ln(idx) * xs_;
-            });
-        }
-
         if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT ||
                      kFusedQuant == Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
         {
-            Epilogue{}(y_window_, y_scale_window, ln, smem);
+            Epilogue{}(y_window_, x_scale_window_, y_scale_window, ln, smem);
         }
         else
             Epilogue{}(y_window_, ln);
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 48f66739d..fadf56dfd 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -106,7 +106,7 @@ struct Layernorm2dFwdPipelineTwoPass
         auto block_welford_cross_warp_sync =
             Policy::template GetBlockWelfordCrossWarpSync<Problem>();
 
-        using XTensorType = decltype(load_tile(x_window));
+        using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(x_window)));
         auto mean         = block_welford.template MakeMeanVarBlockTile<XTensorType>();
         auto var          = block_welford.template MakeMeanVarBlockTile<XTensorType>();
 
@@ -117,22 +117,22 @@ struct Layernorm2dFwdPipelineTwoPass
 
             move_tile_window(x_window, {0, Block_N});
             move_tile_window(x_residual_window, {0, Block_N});
+            auto acc = cast_tile<ComputeDataType>(x);
+
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                          kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
             {
                 sweep_tile(x_resi, [&](auto idx) {
                     // compute x = x_resi + x
-                    auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
-                               type_convert<ComputeDataType>(x(idx));
-                    x(idx) = type_convert<XDataType>(re_);
+                    acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
                 });
                 if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
                 {
-                    store_tile(y_residual_window, x);
+                    store_tile(y_residual_window, cast_tile<YResidualDataType>(acc));
                     move_tile_window(y_residual_window, {0, Block_N});
                 }
             }
-            block_welford(x, mean, var, cur_count, max_count);
+            block_welford(acc, mean, var, cur_count, max_count);
         }
 
         block_welford_sync(mean, var, cur_count);
@@ -166,21 +166,21 @@ struct Layernorm2dFwdPipelineTwoPass
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
+            auto acc    = cast_tile<ComputeDataType>(x);
+
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                          kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
             {
                 sweep_tile(x_resi, [&](auto idx) {
                     // compute x = x_resi + x
-                    auto re_ = type_convert<ComputeDataType>(x_resi(idx)) +
-                               type_convert<ComputeDataType>(x(idx));
-                    x(idx) = type_convert<XDataType>(re_);
+                    acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
                 });
             }
             // load gamma/beta (TODO: support no gamma/beta?)
             const auto gamma = load_tile(gamma_window);
             const auto beta  = load_tile(beta_window);
 
-            auto ln = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+            auto ln = make_static_distributed_tensor<ComputeDataType>(acc.get_tile_distribution());
 
             sweep_tile(ln, [&, mean_ = mean](auto idx) {
                 constexpr auto i_idx = make_tuple(idx[number<0>{}]);
@@ -189,8 +189,7 @@ struct Layernorm2dFwdPipelineTwoPass
                 const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);
                 const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
 
-                const auto x_ = type_convert<ComputeDataType>(x[idx]);
-                auto ln_      = (x_ - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
+                auto ln_ = (acc(idx) - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
 
                 ln(idx) = ln_;
             });
-- 
GitLab


From 4f1fdbb6e3cae103eab134bb9c1b3001ee48f17f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Mon, 4 Nov 2024 22:34:17 +0100
Subject: [PATCH 034/153] Temporary disable part of dynamic op conv instances
 (#1630)

* Temporary disable part of dynamic op conv instances

* fix
---
 ...ouped_conv_fwd_xdl_dynamic_op_instance.hpp | 20 +++++++++++--------
 ...mic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp |  3 +++
 ...amic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp |  3 +++
 ...amic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp |  3 +++
 ...mic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp |  3 +++
 ..._op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp |  3 +++
 ...c_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp |  3 +++
 ...c_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp |  3 +++
 ..._op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp |  3 +++
 9 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
index 9db675a51..82c01a634 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
@@ -53,8 +53,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
         // instances for small conv.K and conv.C
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+        #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
@@ -68,6 +68,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    Tuple<>,  BF16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        #endif
     // clang-format on
     >;
 
@@ -87,8 +88,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
         // instances for small conv.K and conv.C
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+        #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
@@ -102,6 +103,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,  Tuple<>,    F16, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        #endif
     // clang-format on
     >;
 
@@ -121,8 +123,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
         // instances for small conv.K and conv.C
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>
+        #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
@@ -136,6 +138,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,  Tuple<>,    F32, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
+        #endif
     // clang-format on
     >;
 
@@ -155,8 +158,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
         // instances for small conv.K and conv.C
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+        #if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause long compilation time)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
@@ -170,6 +173,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,  Tuple<>,  int8_t, PassThrough, PassThrough,     DynamicUnaryOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        #endif
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 853470e1c..4ee20a0ca 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc
                                                               Tuple<>,
                                                               NHWGK,
                                                               ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instanc
                                                               Tuple<>,
                                                               NHWGK,
                                                               ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 725b9ca0d..18a616ef1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index fbd5fe370..850458f53 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2,
@@ -47,6 +49,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance
                                                              Tuple<>,
                                                              NHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
index 6bfc29537..f69bcf1a7 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -30,6 +30,8 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc
                                                               Tuple<>,
                                                               NHWGK,
                                                               ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2,
@@ -46,6 +48,7 @@ void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instanc
                                                               Tuple<>,
                                                               NHWGK,
                                                               ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index 249dfaa4d..00c0ba3ea 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst
                                                               Tuple<>,
                                                               NDHWGK,
                                                               ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_inst
                                                               Tuple<>,
                                                               NDHWGK,
                                                               ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index 75c4ddc35..aa47bbdbe 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_insta
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 2e237e07b..8df05d9da 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -31,6 +31,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3,
@@ -47,6 +49,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_insta
                                                              Tuple<>,
                                                              NDHWGK,
                                                              ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
index e38f1acbd..c50b64917 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -30,6 +30,8 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst
                                                               Tuple<>,
                                                               NDHWGK,
                                                               ConvFwdDefault>{});
+#if 0 // Enable with dynamic op optimizations (at now generating a lot of virtual functions cause
+      // long compilation time)
     add_device_operation_instances(
         instances,
         device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3,
@@ -46,6 +48,7 @@ void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_inst
                                                               Tuple<>,
                                                               NDHWGK,
                                                               ConvFwd1x1S1P0>{});
+#endif
 }
 
 } // namespace instance
-- 
GitLab


From 0c9012fb70bcd2750ff0d5b8c23e4bc6f5937709 Mon Sep 17 00:00:00 2001
From: Lin Sun <linsun12@amd.com>
Date: Mon, 4 Nov 2024 16:33:20 -0800
Subject: [PATCH 035/153] Linsun/convint8 fwd instances (#1626)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add instances for int8 grouped conv2d fwd
---------

Co-authored-by: root <root@dell300x-pla-t28-03.pla.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
---
 ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 25 +++++++
 ...ped_conv_fwd_xdl_large_tensor_instance.hpp | 19 ++++++
 ...vice_grouped_conv_fwd_xdl_mem_instance.hpp | 37 +++++++++++
 ...ed_conv_fwd_xdl_merged_groups_instance.hpp | 19 ++++++
 .../gpu/grouped_convolution_forward.hpp       | 45 ++++++++++++-
 .../grouped_convolution_forward_comp_xdl.inc  | 32 +++++++++
 ...uped_convolution_forward_mem_inter_xdl.inc | 32 +++++++++
 ...uped_convolution_forward_mem_intra_xdl.inc | 32 +++++++++
 .../gpu/grouped_convolution_forward_xdl.inc   | 32 +++++++++
 ...d_convolution_forward_xdl_large_tensor.inc | 16 +++++
 ..._convolution_forward_xdl_merged_groups.inc | 30 +++++++++
 .../gpu/grouped_conv2d_fwd/CMakeLists.txt     | 11 ++++
 ...l_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp | 39 +++++++++++
 ...l_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp | 64 ++++++++++++++++++
 ...wd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp | 38 +++++++++++
 ...wd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 62 +++++++++++++++++
 ...tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 39 +++++++++++
 ...hw_gkyxc_ngkhw_int8_mem_inter_instance.cpp | 39 +++++++++++
 ...hw_gkyxc_ngkhw_int8_mem_intra_instance.cpp | 39 +++++++++++
 ...gc_gkyxc_nhwgk_int8_mem_inter_instance.cpp | 66 +++++++++++++++++++
 ...gc_gkyxc_nhwgk_int8_mem_intra_instance.cpp | 66 +++++++++++++++++++
 ...groups_ngchw_gkyxc_ngkhw_int8_instance.cpp | 48 ++++++++++++++
 ...groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 48 ++++++++++++++
 .../test_grouped_convnd_fwd.cpp               |  4 +-
 ...est_grouped_convnd_fwd_large_cases_xdl.cpp |  3 +-
 25 files changed, 880 insertions(+), 5 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
index 7490ef223..dc4ee534b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
@@ -131,6 +131,31 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
index 05cb8d5d0..d317d270c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
@@ -87,6 +87,25 @@ using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple<
         DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>
     // clang-format on
     >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_large_tensor_int8_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
index 2388c4db0..1f381af08 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
@@ -154,6 +154,43 @@ using device_grouped_conv_fwd_xdl_f32_mem_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          BlockGemmPipelineScheduler BlkGemmPipeSched>
+using device_grouped_conv_fwd_xdl_int8_mem_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
index 96baf6bb0..242ad2f73 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
@@ -90,6 +90,25 @@ using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_merged_groups_int8_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // Instances with NumGroupsPerBatch > 1
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,     32,    64,     32,   8, 8,  32,   32,    1,    2,  S< 4, 16,  1>, S<1, 0, 2>,     S<1, 0, 2>,                   2,              8,              8,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      1,           1,           1,   S<1, 16, 1, 4>,                  1, int8_t, int8_t, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,     32,    64,     32,   8, 8,  32,   32,    1,    2,  S< 4, 16,  1>, S<1, 0, 2>,     S<1, 0, 2>,                   2,              8,              8,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      1,           1,           1,   S<1, 16, 1, 4>,                  1, int8_t, int8_t, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,     32,    64,     32,   8, 8,  32,   32,    1,    2,  S< 4, 16,  1>, S<1, 0, 2>,     S<1, 0, 2>,                   2,              8,              8,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      1,           1,           1,   S<1, 16, 1, 4>,                  1, int8_t, int8_t, LoopScheduler::Default, 32>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index d884678de..8090b2449 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -122,6 +122,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif // DL_KERNELS
 
 #ifdef CK_USE_XDL
+        // 1D
+        // layout GNWC/GKXC/GNWK
         if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, GNWC> &&
                      is_same_v<WeiLayout, GKXC> && is_same_v<OutLayout, GNWK>)
         {
@@ -160,7 +162,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             }
 #endif
         }
-
+        // 2D
+        // layout GNHWC/GKYXC/GNHWK
         if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
         {
@@ -191,7 +194,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             }
 #endif
         }
-
+        // layout NHWGC/GKYXC/NHWGK
         if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
         {
@@ -247,8 +250,27 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances(
                     op_ptrs);
             }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t> && is_same_v<AComputeType, int8_t> &&
+                         is_same_v<BComputeType, int8_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
+                    op_ptrs);
+            }
 #endif
         }
+
+        // layout NGCHW/GKYXC/NGKHW
         if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NGCHW> &&
                      is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NGKHW>)
         {
@@ -282,8 +304,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     op_ptrs);
             }
 #endif
+
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t> && is_same_v<AComputeType, int8_t> &&
+                         is_same_v<BComputeType, int8_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
         }
 
+        // 3D
+        // layout GNDHWC/GKZYXC/GNDHWK
         if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&
                      is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, GNDHWK>)
         {
@@ -323,6 +363,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
         }
 
+        // layout NDHWGC/GKZYXC/NDHWGK
         if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
                      is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
         {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
index 9e4a0bbb6..e47a876e1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
@@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 // grouped conv2d forward, NGCHW/GKYXC/NGKHW
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
@@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
index d9470fb3f..f74622ad4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
@@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 // grouped conv2d forward, NGCHW/GKYXC/NGKHW
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances(
@@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
index 0b1914255..81737b614 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
@@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 // grouped conv2d forward, NGCHW/GKYXC/NGKHW
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances(
@@ -90,6 +106,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
index b1c13696c..4cb2aae09 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -171,6 +171,22 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 // grouped conv2d forward, NGCHW/GKYXC/NGKHW
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(
@@ -204,6 +220,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
index 6a2c61d05..5f35ab5a4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_large_tensor.inc
@@ -57,6 +57,22 @@ void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instan
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
index 474a61e56..1bd2697b9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
@@ -85,6 +85,36 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_insta
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 083d92d09..98bee66a9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -9,45 +9,56 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
    # large tensor
    # NHWGC, GKYXC, NHWGK
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # merged groups
    # NHWGC, GKYXC, NHWGK
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
    #mem
    # NHWGC, GKYXC, NHWGK
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
    # NHWGC, GKYXC, NHWGK
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
    # NGCHW, GKYXC, NGKHW
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
    # NGCHW, GKYXC, NGKHW
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
    #comp
    # NHWGC, GKYXC, NHWGK
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
    # NGCHW, GKYXC, NGKHW
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
    #dl
    # GNHWC, GKYXC, GNHWK
    dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
new file mode 100644
index 000000000..d98b89c55
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_int8_comp_instances<2,
+                                                        NGCHW,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NGKHW,
+                                                        ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
new file mode 100644
index 000000000..78c2257b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_int8_comp_instances<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NHWGK,
+                                                        ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_comp_instances<2,
+                                                                                   NHWGC,
+                                                                                   GKYXC,
+                                                                                   Empty_Tuple,
+                                                                                   NHWGK,
+                                                                                   ConvFwd1x1P0>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_int8_comp_instances<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NHWGK,
+                                                        ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_comp_instances<2,
+                                                                                   NHWGC,
+                                                                                   GKYXC,
+                                                                                   Empty_Tuple,
+                                                                                   NHWGK,
+                                                                                   ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
new file mode 100644
index 000000000..65c75fa04
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_instances<2,
+                                                                              NGCHW,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
new file mode 100644
index 000000000..5c425effd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NHWGK,
+                                                                              ConvFwdOddC>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
new file mode 100644
index 000000000..4064c6634
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_large_tensor_int8_instances<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
new file mode 100644
index 000000000..9f0f9371b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
new file mode 100644
index 000000000..217f57d87
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
new file mode 100644
index 000000000..f667481fa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC,
+                                                                                  Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
new file mode 100644
index 000000000..2ff2c7f51
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC,
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
new file mode 100644
index 000000000..c66d48ed7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
new file mode 100644
index 000000000..8bdf5f527
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index 29034afd9..1abd4fd9f 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -58,13 +58,13 @@ using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>,
 using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
                                        std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>,
                                        std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK>,
                                        std::tuple<float, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<int8_t, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<float, NGCHW, GKYXC, NGKHW>,
-                                       std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>>;
+                                       std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>,
+                                       std::tuple<int8_t, NGCHW, GKYXC, NGKHW>>;
 
 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
                                        std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
index 3d734fa5e..088fed89f 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
@@ -52,7 +52,8 @@ using namespace ck::tensor_layout::convolution;
 
 using KernelTypes2d = ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
-                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>>;
+                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>;
 
 using KernelTypes3d = ::testing::Types<std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
                                        std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
-- 
GitLab


From 464abd235e27c33422aa52ed2044af8fbcc3a88d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 <jmartinezcaamao@gmail.com>
Date: Tue, 5 Nov 2024 10:09:52 +0100
Subject: [PATCH 036/153] [generate.py] Override blob list if it already exists
 (#1635)

Before, generate.py appended the list at the end of the output file.
When running the cmake configuration steps multiple times on the
examples, the blob list (such as fwd_blob_list.txt) would grow at every
configuration.
`library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt` worked around
this issue by removing the output file if it exists.

Now, generate.py overrides the content of the output file.
There is no need for the workaround in the CMakeLists.txt;
and the issue is solved for the example projects too.
---
 example/ck_tile/01_fmha/generate.py                          | 3 +++
 example/ck_tile/02_layernorm2d/generate.py                   | 2 +-
 library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt | 5 -----
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index 9b91d36fb..5b1b6664c 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -47,6 +47,9 @@ def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter
     assert output_file is not None
     file_path = Path(output_file)
 
+    # create an empty file / drop its contents if it exists
+    open(file_path, "w").close()
+
     for api in api_list:
         handler = handlers[api][HandlerId.LIST_BLOBS]
         handler(file_path, kernel_filter, receipt, mask_impl)
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index bf576db97..09aa6b65f 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -559,7 +559,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         w_p = Path(self.working_path)
         list_p = w_p / 'layernorm2d_fwd_blobs.txt'
         blobs = self.get_blobs()
-        with list_p.open('a') as list_f:
+        with list_p.open('w') as list_f:
             # api related file
             list_f.write(str(w_p / (self.name_api + ".cpp"))  + "\n")
             list_f.write(str(w_p / (self.name_common_header + ".hpp"))  + "\n")
diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
index 6d638b174..a53fde166 100644
--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -27,11 +27,6 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops)
 # headers for building lib
 file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER})
 
-# Delete the blob file if it exists to avoid append of old content.
-if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
-    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
-endif()
-
 set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
 
 # generate a list of kernels, but not actually emit files at config stage
-- 
GitLab


From b6e74be1aa38396609bca91cba5f9e5f8665e4b0 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 5 Nov 2024 08:53:10 -0800
Subject: [PATCH 037/153] Make sure cmake can handle the xnack+/xnack- targets.
 (#1633)

* make sure cmake can handle xnack targets

* dont build xdl instances for gfx906:xnack-

* dont build xdl tests for gfx906:xnack-
---
 example/CMakeLists.txt                               |  8 ++++----
 .../src/tensor_operation_instance/gpu/CMakeLists.txt | 10 +++++-----
 test/CMakeLists.txt                                  | 12 ++++++------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index ad3f7c787..22af7b2d5 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -85,9 +85,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
         set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
         add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -169,9 +169,9 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
         set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
         add_executable(${EXAMPLE_NAME} ${FILE_NAME})
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 6756c3351..c8bbd6eb0 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME)
         foreach(source IN LISTS ARGN)
             set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
             if(source MATCHES "_xdl")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             elseif(source MATCHES "_wmma")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
             elseif(source MATCHES "mha")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
             #only build the fp8 gemm instances for gfx908/90a if the build argument is set
             if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
                 if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
                 endif()
                 if(source MATCHES "gemm_multiply_multiply_f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
                 endif()
             endif()
             set(offload_targets)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b12ced524..a81c5a96b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
-- 
GitLab


From d0e3a70a2e3ebb8f979c82309e3e58b5c23fe865 Mon Sep 17 00:00:00 2001
From: darren-amd <Darren.Lao@amd.com>
Date: Tue, 5 Nov 2024 12:59:08 -0500
Subject: [PATCH 038/153] Statically Cast Pointer Offset (#1631)

* explicit cast ptr offset

* formating change
---
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 12 +++++-----
 ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 24 +++++++++----------
 ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 12 +++++-----
 ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 24 +++++++++----------
 .../gpu/grid/gridwise_tensor_rearrange.hpp    |  8 +++----
 5 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 5e9da459c..b544c925e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -93,12 +93,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
index d3c0f84b9..c1f58ccda 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -60,12 +60,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
@@ -117,12 +117,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 65b7b6cb7..3e14f66a0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -98,12 +98,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t c_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t c_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
index b3b057c80..de6c9c160 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -60,12 +60,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
@@ -155,12 +155,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
index 174074990..ddf0b4a58 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -121,10 +121,10 @@ struct GridwiseTensorRearrange
             __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
         // Global Memory
-        const index_t a_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-        const index_t c_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+        const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
         const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
-- 
GitLab


From 54440cf562b31eea6a158057fd8c41e9db1b4cc8 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 5 Nov 2024 13:56:20 -0800
Subject: [PATCH 039/153] remove gfx940;gfx941 from default target lists
 (#1640)

---
 CMakeLists.txt | 8 ++++----
 Jenkinsfile    | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74628597a..bd2f60683 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")
 
 message("checking which targets are supported")
 #In order to build just the CK library (without tests and examples) for all supported GPU targets
-#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" 
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
 #
 #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
 if(NOT ENABLE_ASAN_PACKAGING)
     if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
         # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
     else()
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
     endif()
 else()
     #build CK only for xnack-supported targets when using ASAN
-    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
+    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 endif()
 
 #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
diff --git a/Jenkinsfile b/Jenkinsfile
index 48b4c805c..b79b2045b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1101,11 +1101,11 @@ pipeline {
                     agent{ label rocmnode("gfx90a") }
                     environment{
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                                         -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                         -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                                          -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                           -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                                            -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
@@ -1165,7 +1165,7 @@ pipeline {
                         execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -D CMAKE_BUILD_TYPE=Release \
-                                           -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"  \
+                                           -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"  \
                                            -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
                     }
                     steps{
-- 
GitLab


From 365f39aed0d5335b6e39d5049231558128cfedd9 Mon Sep 17 00:00:00 2001
From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
Date: Tue, 5 Nov 2024 14:58:29 -0700
Subject: [PATCH 040/153] Prevent instantiation of undefined FP8 operators.
 (#1639)

---
 .../elementwise_scale_permute_amax_2D_fp16_fp8.cpp           | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
index 7ac3c4e23..9431a8cde 100644
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
@@ -68,7 +68,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
 
 using DeviceReduceInstance =
     ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
-                                                         OutputDataType,
+                                                         ScaleDataType,
                                                          OutputDataType,
                                                          NumDim,
                                                          NumDim,
@@ -108,7 +108,8 @@ void reference_scale_permute_amax(Tensor<InputDataType>& input,
             host_output_scaled_casted_transposed(m, k) = y1;
             const OutputDataType y_fabs =
                 ck::type_convert<OutputDataType>(ck::math::abs(ck::type_convert<float>(y0)));
-            host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0));
+            host_output_amax(0) = ck::type_convert<OutputDataType>(ck::math::max(
+                ck::type_convert<float>(y_fabs), ck::type_convert<float>(host_output_amax(0))));
         }
     }
 }
-- 
GitLab


From dcafb1de15a8fd1de3496f19fd806ac9cb185012 Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Wed, 6 Nov 2024 10:44:58 +0100
Subject: [PATCH 041/153] Generic threshold calculation after merge fixes
 (#1618)

* Generic threshold calculation add passing num of accums

* Generic threshold - after merge fixes

* Fix cmakelists

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
---
 .../include/ck/library/utility/check_err.hpp   |  8 ++++----
 .../profiler/profile_pool3d_fwd_impl.hpp       | 18 ++++++++++++++++--
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index 73ac2a189..88741c3b9 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -24,7 +24,7 @@ namespace ck {
 namespace utils {
 
 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_relative_threshold(const int numberOfAccumulations = 1)
+double get_relative_threshold(const int number_of_accumulations = 1)
 {
     using F8   = ck::f8_t;
     using F16  = ck::half_t;
@@ -79,13 +79,13 @@ double get_relative_threshold(const int numberOfAccumulations = 1)
     }
     else
     {
-        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }
 
 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1)
+double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
 {
     using F8   = ck::f8_t;
     using F16  = ck::half_t;
@@ -142,7 +142,7 @@ double get_absolute_threshold(const double max_possible_num, const int numberOfA
     else
     {
         acc_error =
-            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index a0890028a..cbdacad53 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -240,6 +240,19 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         {
             out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
 
+            auto number_of_accumulations = 1;
+            static_assert(
+                ReduceOpId == ck::ReduceTensorOp::AVG || ReduceOpId == ck::ReduceTensorOp::MAX,
+                "Warning: Unhandled ReduceOpId for setting up the number of accumulations!");
+
+            if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
+            {
+                for(size_t i = 0; i < kernel_params.window_spatial_lengths.size(); ++i)
+                {
+                    number_of_accumulations *= kernel_params.window_spatial_lengths.at(i);
+                }
+            }
+
             auto absolute_error_threshold = 1.0;
             switch(in_params.init_method)
             {
@@ -250,9 +263,10 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
 
             absolute_error_threshold =
                 ck::utils::get_absolute_threshold<ComputeDataType, OutDataType>(
-                    absolute_error_threshold);
+                    absolute_error_threshold, number_of_accumulations);
             auto relative_error_threshold =
-                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>();
+                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>(
+                    number_of_accumulations);
 
             bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                              out_n_c_do_ho_wo_host.mData,
-- 
GitLab


From 3599418aa8f6b19e94c09160a086030ed50c7184 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Thu, 7 Nov 2024 03:32:44 +0800
Subject: [PATCH 042/153] Fix F16 type (#1583)

---
 profiler/src/profile_layernorm_fwd.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/profiler/src/profile_layernorm_fwd.cpp b/profiler/src/profile_layernorm_fwd.cpp
index a261bd741..7031b3653 100644
--- a/profiler/src/profile_layernorm_fwd.cpp
+++ b/profiler/src/profile_layernorm_fwd.cpp
@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
 
         if(data_type == ck::DataTypeEnum::Half)
         {
-            ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
+            ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F16, false, rank>(
                 do_verification, init_method, do_log, time_kernel, length);
         }
         else if(data_type == ck::DataTypeEnum::Float)
-- 
GitLab


From 75c5bfa3642cb368acae5c7824aa7d6c506f5dae Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 7 Nov 2024 14:14:42 -0800
Subject: [PATCH 043/153] enable compilation for generic navi targets (#1645)

---
 example/CMakeLists.txt                                   | 4 ++--
 include/ck/ck.hpp                                        | 8 +++++---
 include/ck/utility/amd_wmma.hpp                          | 5 +++--
 include/ck_tile/core/config.hpp                          | 8 +++++---
 library/src/tensor_operation_instance/gpu/CMakeLists.txt | 8 ++++----
 test/CMakeLists.txt                                      | 8 ++++----
 6 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 22af7b2d5..ea739c707 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -85,7 +85,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(FILE_NAME MATCHES "_wmma")
             list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
@@ -169,7 +169,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(FILE_NAME MATCHES "_wmma")
             list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 5f74d51a6..999eb0229 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -63,13 +63,15 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #define __gfx101__
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
+    defined(__gfx10_3_generic__)
 #define __gfx103__
 #endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
 
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index 322a0f94b..d04513f3e 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -9,7 +9,8 @@
 // TODO: Add arch limitation
 namespace ck {
 
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
 /********************************WAVE32 MODE***********************************************/
@@ -260,7 +261,7 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
 // gfx12
 /********************************WAVE32 MODE***********************************************/
 
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
 
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index 4be50b865..604c9551f 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -11,13 +11,15 @@
 #define __gfx94__
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
+    defined(__gfx10_3_generic__)
 #define __gfx103__
 #endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
 
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index c8bbd6eb0..80f0fc306 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME)
         foreach(source IN LISTS ARGN)
             set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
             if(source MATCHES "_xdl")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             elseif(source MATCHES "_wmma")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
             elseif(source MATCHES "mha")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             endif()
             #only build the fp8 gemm instances for gfx908/90a if the build argument is set
             if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
                 if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
                 if(source MATCHES "gemm_multiply_multiply_f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
             endif()
             set(offload_targets)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a81c5a96b..498a20dc5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(ARGN MATCHES "_wmma")
              list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(ARGN MATCHES "_wmma")
              list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
-- 
GitLab


From 686a58a912f6884a9b66841cf04b4b81ba35aa7f Mon Sep 17 00:00:00 2001
From: dummycoderfe <felix.li@amd.com>
Date: Fri, 8 Nov 2024 12:28:23 +0800
Subject: [PATCH 044/153] [Ck tile] layernorm2d fwd optimize (#1637)

* optimze small N case using vec io and using rcp div

* [Ck_tile] layernorm, add param to control fastdiv; change generate codes and test pass

* [Ck_tile] fix blockSize compute in Generic2dBlockShape

* [Ck_tile]fix kfastfdiv template style

* [Ck_tile] layernorm, fix stype in review

---------

Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
---
 example/ck_tile/02_layernorm2d/generate.py    | 105 ++++++++++--------
 .../ops/common/generic_2d_block_shape.hpp     |  12 +-
 ...ayernorm2d_fwd_pipeline_default_policy.hpp |  12 +-
 .../layernorm2d_fwd_pipeline_one_pass.hpp     |  11 +-
 .../pipeline/layernorm2d_fwd_traits.hpp       |   2 +
 .../ops/welford/block/block_welford.hpp       |  34 ++++--
 .../welford/block/block_welford_problem.hpp   |   9 +-
 .../ops/welford/thread/thread_welford.hpp     |  43 +++++--
 8 files changed, 144 insertions(+), 84 deletions(-)

diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index 09aa6b65f..ca9e432a4 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -57,6 +57,7 @@ template <typename XDataType_,
           ck_tile::index_t Vector_N_,         // vector size along N
           bool kPadN_,
           bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
           bool kTwoPass_,
           ck_tile::index_t kFusedAdd_ = 0,
           ck_tile::index_t kFusedQuant_ = 0>
@@ -118,6 +119,7 @@ struct layernorm2d_fwd_traits_
 
     static constexpr bool kPadN           = kPadN_;
     static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kFastFDiv       = kFastFDiv_;
     static constexpr bool kTwoPass        = kTwoPass_;
     static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
     static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
@@ -134,6 +136,7 @@ template <typename XDataType_,
           ck_tile::index_t Vector_N_,         // vector size along N
           bool kPadN_,
           bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
           bool kTwoPass_,
           int  kFusedAdd_,
           int  kFusedQuant_>
@@ -148,6 +151,7 @@ using traits_ = layernorm2d_fwd_traits_<XDataType_,
                                        Vector_N_,
                                        kPadN_,
                                        kSaveMeanInvStd_,
+                                       kFastFDiv_,
                                        kTwoPass_,
                                        kFusedAdd_,
                                        kFusedQuant_>;
@@ -179,6 +183,7 @@ float layernorm2d_fwd_(const S& s, A a)
 
     using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
         Traits_::kSaveMeanInvStd,
+        Traits_::kFastFDiv,
         Traits_::kTwoPass,
         static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
         static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
@@ -269,7 +274,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
 #include "layernorm2d_fwd_api_common.hpp"
 
 // clang-format off
-//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv     2p      add  sweep
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf    2p      add  sweep
 {F_instance_def}
 // clang-format on
 
@@ -356,6 +361,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         F_Vector_N : int
         F_kPadN : bool
         F_kSaveMeanInvStd_ : bool
+        F_kFastFDiv_ : bool
         F_kTwoPass_ : bool
         F_kFusedAdd : int
         F_kFusedQuant : int
@@ -363,7 +369,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         @property
         def trait_name(self) ->str:
             t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
-            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}'
             t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
             return t_
 
@@ -483,52 +489,55 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         fused_add_list = [0, 1]
         fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
 
-        #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, False,   0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False,  True,   0,    0)]}
+        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True,  True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True,  True,   0,    0)]}
         total_blob = list()
         for hs_key in h_trait_dict:
             hs = h_trait_dict[hs_key]
diff --git a/include/ck_tile/ops/common/generic_2d_block_shape.hpp b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
index 64ad20c3b..c0bfd9319 100644
--- a/include/ck_tile/ops/common/generic_2d_block_shape.hpp
+++ b/include/ck_tile/ops/common/generic_2d_block_shape.hpp
@@ -38,9 +38,7 @@ namespace ck_tile {
 template <typename BlockTile_,    // block size, seq<M, N>
           typename WarpPerBlock_, // num warps along seq<M, N>
           typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
-          index_t BlockSize_ =
-              warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
+          typename Vector_>       // contiguous pixels(vector size) along seq<M, N>)>
 struct Generic2dBlockShape
 {
     // block size
@@ -68,10 +66,12 @@ struct Generic2dBlockShape
     static_assert(Warp_M % Vector_M == 0);
     static_assert(Warp_N % Vector_N == 0);
     // num of threads along seq<M, N>, within each warp
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+    static constexpr index_t ThreadPerWarp_M  = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N  = Warp_N / Vector_N;
+    static constexpr index_t ThreadPerBlock_M = Block_M / Repeat_M / Vector_M;
+    static constexpr index_t ThreadPerBlock_N = Block_N / Repeat_N / Vector_N;
 
-    static constexpr index_t BlockSize = BlockSize_;
+    static constexpr index_t BlockSize = ThreadPerBlock_M * ThreadPerBlock_N;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
index 1de230c14..724f6261d 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
@@ -47,7 +47,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     {
         using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape>;
+                                       typename Problem::BlockShape,
+                                       Problem::Traits::kFastFDiv>;
 
         return BlockWelford<P_>{};
     }
@@ -57,7 +58,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     {
         using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape>;
+                                       typename Problem::BlockShape,
+                                       Problem::Traits::kFastFDiv>;
 
         return BlockWelfordSync<P_>{};
     }
@@ -67,7 +69,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     {
         using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                        typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape>;
+                                       typename Problem::BlockShape,
+                                       Problem::Traits::kFastFDiv>;
 
         return BlockWelfordCrossWarpSync<P_>{};
     }
@@ -79,7 +82,8 @@ struct Layernorm2dFwdPipelineDefaultPolicy
         {
             using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
                                            typename Problem::ComputeDataType,
-                                           typename Problem::BlockShape>;
+                                           typename Problem::BlockShape,
+                                           Problem::Traits::kFastFDiv>;
 
             using block_welford = BlockWelford<P_>;
             using x_block_tile =
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index 83cdab428..4b83ed4fb 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -36,6 +36,7 @@ struct Layernorm2dFwdPipelineOnePass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
     static constexpr bool kPadN              = Problem::Traits::kPadN;
+    static constexpr bool kFastFDiv          = Problem::Traits::kFastFDiv;
     static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
@@ -125,7 +126,15 @@ struct Layernorm2dFwdPipelineOnePass
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
-                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
+                if(kFastFDiv && std::is_same_v<ComputeDataType, float>)
+                {
+                    return type_convert<ComputeDataType>(1.0f) *
+                           __builtin_amdgcn_rcpf(sqrt(v_ + epsilon));
+                }
+                else
+                {
+                    return type_convert<ComputeDataType>(1.0f) / sqrt(v_ + epsilon);
+                }
             },
             var);
 
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
index ed9e18be3..e8c22f8ab 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
@@ -39,6 +39,7 @@ template<> struct Layernorm2dFusedQuantEnumName<Layernorm2dFusedQuantEnum::SMOOT
 
 template <bool kPadN_,
           bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
           bool kTwoPass_,
           Layernorm2dFusedAddEnum kFusedAdd_,
           Layernorm2dFusedQuantEnum kFusedQuant_>
@@ -46,6 +47,7 @@ struct Layernorm2dFwdTraits
 {
     static constexpr bool kPadN                            = kPadN_;
     static constexpr bool kSaveMeanInvStd                  = kSaveMeanInvStd_;
+    static constexpr bool kFastFDiv                        = kFastFDiv_;
     static constexpr bool kTwoPass                         = kTwoPass_;
     static constexpr Layernorm2dFusedAddEnum kFusedAdd     = kFusedAdd_;
     static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_;
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
index ce73c183e..968895e38 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -11,9 +11,10 @@ namespace ck_tile {
 template <typename Problem_, typename Policy_ = void>
 struct BlockWelford
 {
-    using Problem         = remove_cvref_t<Problem_>;
-    using XDataType       = typename Problem::XDataType;
-    using ComputeDataType = typename Problem::ComputeDataType;
+    using Problem                   = remove_cvref_t<Problem_>;
+    using XDataType                 = typename Problem::XDataType;
+    using ComputeDataType           = typename Problem::ComputeDataType;
+    static constexpr bool kFastFDiv = Problem::kFastFDiv;
 
     CK_TILE_DEVICE constexpr BlockWelford() {}
 
@@ -89,7 +90,8 @@ struct BlockWelford
 template <typename Problem_, typename Policy_ = void>
 struct BlockWelfordSync
 {
-    using Problem = remove_cvref_t<Problem_>;
+    using Problem                   = remove_cvref_t<Problem_>;
+    static constexpr bool kFastFDiv = Problem::kFastFDiv;
 
     template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
     CK_TILE_DEVICE void
@@ -173,8 +175,9 @@ struct BlockWelfordSync
 template <typename Problem_, typename Policy_ = void>
 struct BlockWelfordCrossWarpSync
 {
-    using Problem    = remove_cvref_t<Problem_>;
-    using BlockShape = typename Problem::BlockShape;
+    using Problem                   = remove_cvref_t<Problem_>;
+    using BlockShape                = typename Problem::BlockShape;
+    static constexpr bool kFastFDiv = Problem::kFastFDiv;
 
     template <typename MeanDistributedTensor_>
     CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
@@ -351,12 +354,23 @@ CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_
 }
 
 // Note: this function must be called after all the computation
-template <typename VarDistributedTensor_>
+template <typename VarDistributedTensor_, bool FastFdiv_ = false>
 CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor,
-                                                                int count)
+                                                                int count,
+                                                                bool_constant<FastFdiv_> = {})
 {
     using DataType = typename VarDistributedTensor_::DataType;
-    tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
-                           var_tensor);
+    tile_elementwise_inout(
+        [&count](auto& x) {
+            if(FastFdiv_ && std::is_same_v<DataType, float>)
+            {
+                x = x * __builtin_amdgcn_rcpf(type_convert<DataType>(count));
+            }
+            else
+            {
+                x = x / type_convert<DataType>(count);
+            }
+        },
+        var_tensor);
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/welford/block/block_welford_problem.hpp
index dcae1ef2e..bcbfb7d76 100644
--- a/include/ck_tile/ops/welford/block/block_welford_problem.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford_problem.hpp
@@ -7,12 +7,13 @@
 
 namespace ck_tile {
 
-template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
+template <typename XDataType_, typename ComputeDataType_, typename BlockShape_, bool kFastFDiv_>
 struct BlockWelfordProblem
 {
-    using XDataType       = remove_cvref_t<XDataType_>;
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using XDataType                 = remove_cvref_t<XDataType_>;
+    using ComputeDataType           = remove_cvref_t<ComputeDataType_>;
+    using BlockShape                = remove_cvref_t<BlockShape_>;
+    static constexpr bool kFastFDiv = kFastFDiv_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp
index 4c61cdcf4..52b253e5f 100644
--- a/include/ck_tile/ops/welford/thread/thread_welford.hpp
+++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp
@@ -7,25 +7,46 @@
 
 namespace ck_tile {
 
-template <typename T>
-CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
+template <typename T, bool kFastFDiv = false>
+CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count, bool_constant<kFastFDiv> = {})
 {
     // TODO: check nan? maybe no
     T delta = x - mean;
-    mean += delta / count;
+    if(kFastFDiv && std::is_same_v<T, float>)
+    {
+        mean += delta * __builtin_amdgcn_rcpf(count);
+    }
+    else
+    {
+        mean += delta / count;
+    }
     T delta2 = x - mean;
     var += delta * delta2;
 }
 
-template <typename T>
-CK_TILE_DEVICE static void
-welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
+template <typename T, bool kFastFDiv = false>
+CK_TILE_DEVICE static void welford_merge(T& mean_a,
+                                         T& var_a,
+                                         int& count_a,
+                                         T mean_b,
+                                         T var_b,
+                                         int count_b,
+                                         bool_constant<kFastFDiv> = {})
 {
-    int count            = count_a + count_b;
-    T count_             = type_convert<T>(count);
-    T count_a_           = type_convert<T>(count_a);
-    T count_b_           = type_convert<T>(count_b);
-    T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
+    int count  = count_a + count_b;
+    T count_   = type_convert<T>(count);
+    T count_a_ = type_convert<T>(count_a);
+    T count_b_ = type_convert<T>(count_b);
+    T count_b_over_count;
+    if(kFastFDiv && std::is_same_v<T, float>)
+    {
+        count_b_over_count =
+            count == 0 ? type_convert<T>(0) : count_b_ * __builtin_amdgcn_rcpf(count_);
+    }
+    else
+    {
+        count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
+    }
 
     T delta = mean_b - mean_a;
     mean_a += delta * count_b_over_count;
-- 
GitLab


From ea3640fdea4b11178c1657feff4849ad011e5d26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Fri, 8 Nov 2024 10:04:33 +0100
Subject: [PATCH 045/153] Add generic instances for two stage conv bwd wei
 (#1643)

* Add generic instances for two stage conv bwd wei

* Update layout prefix
---
 ...conv_bwd_weight_two_stage_xdl_instance.hpp |  76 ++++++++++++-
 .../grouped_convolution_backward_weight.hpp   |  16 +++
 ...rouped_convolution_backward_weight_xdl.inc | 100 ++++++++++++++++++
 .../grouped_conv2d_bwd_weight/CMakeLists.txt  |   4 +
 ...ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp |  41 +++++++
 ..._ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp |  41 +++++++
 ...nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp |  41 +++++++
 ...nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp |   2 +-
 ...nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp |   2 +-
 ..._nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp |  41 +++++++
 ..._nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp |   2 +-
 ..._nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp |   2 +-
 .../grouped_conv3d_bwd_weight/CMakeLists.txt  |   4 +
 ...wgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp |  41 +++++++
 ...wgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp |   2 +-
 ...wgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp |   2 +-
 ...hwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp |  41 +++++++
 ...hwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp |   2 +-
 ...hwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp |   2 +-
 ...dhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp |  41 +++++++
 ...cdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp |  41 +++++++
 21 files changed, 534 insertions(+), 10 deletions(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
index 5f6c340e4..d82f82cce 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
@@ -39,7 +39,25 @@ template <ck::index_t NDimSpatial,
           ConvolutionBackwardWeightSpecialization ConvSpec,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
-using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std::tuple<
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances =
+    std::tuple<
+        // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
+        // clang-format on
+        >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances = std::tuple<
     // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
@@ -64,7 +82,25 @@ template <ck::index_t NDimSpatial,
           ConvolutionBackwardWeightSpecialization ConvSpec,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
-using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = std::tuple<
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances =
+    std::tuple<
+        // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
+        // clang-format on
+        >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances = std::tuple<
     // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
@@ -82,6 +118,24 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = st
     // clang-format on
     >;
 
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances =
+    std::tuple<
+        // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>
+        // clang-format on
+        >;
+
 // NGCHW requires transpose, we use vector loads and stores params for them
 template <ck::index_t NDimSpatial,
           typename ALayout,
@@ -122,6 +176,24 @@ using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances
     // clang-format on
     >;
 
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances =
+    std::tuple<
+        // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>
+        // clang-format on
+        >;
+
 template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index db17f0f38..33027322e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -352,6 +352,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instances(
@@ -375,6 +377,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
@@ -390,6 +394,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                              is_same_v<ComputeTypeB, half_t>)
                 {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
@@ -403,6 +409,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<ComputeTypeA, ck::bhalf_t> &&
                              is_same_v<ComputeTypeB, ck::bhalf_t>)
                 {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
@@ -464,6 +472,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instances(
@@ -487,6 +497,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
@@ -511,6 +523,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                              is_same_v<ComputeTypeB, half_t>)
                 {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
@@ -524,6 +538,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<ComputeTypeA, ck::bhalf_t> &&
                              is_same_v<ComputeTypeB, ck::bhalf_t>)
                 {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances(
+                        op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
index 132dde81a..630eb8135 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -113,6 +113,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -136,6 +148,19 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
@@ -173,6 +198,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -196,6 +233,19 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
@@ -298,6 +348,18 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
@@ -321,6 +383,19 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
@@ -358,6 +433,18 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
@@ -381,6 +468,19 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
index 546a62a8a..05eaf9067 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -15,6 +15,10 @@ set(GROUPED_CONV2D_BWD_WEIGHT
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
     )
 
 if(DL_KERNELS)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
new file mode 100644
index 000000000..0b429af83
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances<
+            2,
+            NGCHW,
+            GKYXC,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
new file mode 100644
index 000000000..d70c95bf6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances<
+            2,
+            NGCHW,
+            GKYXC,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
new file mode 100644
index 000000000..74ccc4c89
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
index 0e4d085de..fab289855 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances<
             2,
             NHWGC,
             GKYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
index 680494cfd..407645e89 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances<
             2,
             NHWGC,
             GKYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
new file mode 100644
index 000000000..807de66ca
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
index 15401f0e1..084c83cd6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances<
             2,
             NHWGC,
             GKYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
index 398c14b11..d174e5b6c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances<
             2,
             NHWGC,
             GKYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index c8c30897c..cf4e323bf 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -15,6 +15,10 @@ set(GROUPED_CONV3D_BWD_WEIGHT
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
     )
 
 if(DL_KERNELS)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
new file mode 100644
index 000000000..63249a1c1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
index 549716586..7841ddad9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances<
             3,
             NDHWGC,
             GKZYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
index 18a00c6ea..ba6285a38 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances<
             3,
             NDHWGC,
             GKZYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
new file mode 100644
index 000000000..a8fbefb5b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
index 4d0f1e68c..e4baafc0b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances<
             3,
             NDHWGC,
             GKZYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
index c5cc062f2..f9bc5b134 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
@@ -25,7 +25,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances<
             3,
             NDHWGC,
             GKZYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
new file mode 100644
index 000000000..16221eb3e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances<
+            3,
+            NGCDHW,
+            GKZYXC,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
new file mode 100644
index 000000000..126e90f2c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances<
+            3,
+            NGCDHW,
+            GKZYXC,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
-- 
GitLab


From af9546d9f4dba6945e23e1c346f92678f0f208f9 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Sat, 9 Nov 2024 09:55:14 +0800
Subject: [PATCH 046/153] Fix 'sh' command compatibility of smoke_test_fwd.sh
 (#1553)

---
 .../ck_tile/01_fmha/script/smoke_test_fwd.sh  | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
index 5dcc6ed42..b867cd6c0 100755
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -29,14 +29,14 @@ while getopts ":sa" opt; do
 done
 
 run_fp16_bf16_tests() {
-    local NUM_SPLITS=(1)
-    local PAGE_BLOCK_SIZE=(0)
-    local CACHE_BATCH_IDX=(0)
+    local NUM_SPLITS="1"
+    local PAGE_BLOCK_SIZE="0"
+    local CACHE_BATCH_IDX="0"
 
     if [ $TEST_SPLITKV -eq 1 ] ; then
-        NUM_SPLITS+=(2 3)
-        PAGE_BLOCK_SIZE+=(128)
-        CACHE_BATCH_IDX+=(1)
+        NUM_SPLITS="$NUM_SPLITS 2 3"
+        PAGE_BLOCK_SIZE="$PAGE_BLOCK_SIZE 128"
+        CACHE_BATCH_IDX="$CACHE_BATCH_IDX 1"
     fi
 
     for prec in "fp16" "bf16" ; do
@@ -47,9 +47,9 @@ run_fp16_bf16_tests() {
     for lse in 0 1 ; do
     for bias in "n" "e" "a" ; do
     for p_drop in 0.0 0.2 ; do
-    for num_splits in "${NUM_SPLITS[@]}" ; do
-    for page_block_size in "${PAGE_BLOCK_SIZE[@]}" ; do
-    for cache_batch_idx in "${CACHE_BATCH_IDX[@]}" ; do
+    for num_splits in $NUM_SPLITS ; do
+    for page_block_size in $PAGE_BLOCK_SIZE ; do
+    for cache_batch_idx in $CACHE_BATCH_IDX ; do
 
     # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
     $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
@@ -103,4 +103,4 @@ if [ $TEST_APPENDKV -eq 1 ] ; then
     run_fp16_appendkv_tests
 fi
 
-set +x
\ No newline at end of file
+set +x
-- 
GitLab


From bec6fbc65fe766ab23fe563675703defdb0dd2be Mon Sep 17 00:00:00 2001
From: dummycoderfe <felix.li@amd.com>
Date: Sat, 9 Nov 2024 17:57:27 +0800
Subject: [PATCH 047/153] Ck tile/moe sorting (#1624)

* add moe_sorting & check ok

* fix comments & typo

* Run remod.py under include/ck_tile & example/ck_tile directories

* format codes

* fix output ci check bug

* fix moe sorting readme and error commit file

* use magiv div to accelerate compute

* add an loop unroll for moe lds ops

* add extblocksnel to set zeros for moebufs

* [Ck_tile] moe set zero run ok, add size check and fix ref check

* [Ck_tile]fix moe_sorting fuse set_zero remod

* [Ck_tile] change name style, fix zero buffer size err, change folder

* [Ck_tile] moe_sorting: fix name style

* [Ck_tile] moe_sorting, remove useless params in traits

* [Ck_tile] change outputtile cnt * unit_size; change output buf alloc

---------

Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
Co-authored-by: Po Yen, Chen <PoYen.Chen@amd.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
---
 example/ck_tile/13_moe_sorting/CMakeLists.txt |   8 +
 example/ck_tile/13_moe_sorting/README.md      |  27 ++
 .../ck_tile/13_moe_sorting/moe_sorting.cpp    | 223 +++++++++++++++++
 .../13_moe_sorting/moe_sorting_api.cpp        |  73 ++++++
 .../13_moe_sorting/moe_sorting_api.hpp        |  20 ++
 .../13_moe_sorting/script/smoke_test.sh       |  19 ++
 example/ck_tile/CMakeLists.txt                |   1 +
 include/ck_tile/host.hpp                      |   1 +
 .../host/reference/reference_moe_sorting.hpp  |  78 ++++++
 .../fused_moe/kernel/moe_sorting_kernel.hpp   | 232 ++++++++++++++++++
 .../pipeline/moe_sorting_pipeline.hpp         |  39 +++
 .../fused_moe/pipeline/moe_sorting_policy.hpp |  15 ++
 .../pipeline/moe_sorting_problem.hpp          |  23 ++
 include/ck_tile/ops/moe_sorting.hpp           |  11 +
 14 files changed, 770 insertions(+)
 create mode 100644 example/ck_tile/13_moe_sorting/CMakeLists.txt
 create mode 100644 example/ck_tile/13_moe_sorting/README.md
 create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting.cpp
 create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
 create mode 100644 example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
 create mode 100644 example/ck_tile/13_moe_sorting/script/smoke_test.sh
 create mode 100644 include/ck_tile/host/reference/reference_moe_sorting.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
 create mode 100644 include/ck_tile/ops/moe_sorting.hpp

diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt
new file mode 100644
index 000000000..09f3e4ac4
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp)
+target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(tile_example_moe_sorting PRIVATE ${EXAMPLE_MOE_SORTING_COMPILE_OPTIONS})
diff --git a/example/ck_tile/13_moe_sorting/README.md b/example/ck_tile/13_moe_sorting/README.md
new file mode 100644
index 000000000..7b6792dd9
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/README.md
@@ -0,0 +1,27 @@
+# moe-sorting
+
+This folder contains example for moe-sorting kernel using ck_tile tile-programming implementation. This kernel is often used in Moe model, before launching the fused-moe-gemm block. The input&weight is a `token*topk` 2d matrix. The op rearange the input weight ids into different experts and feed into fuse moe gemm kernel.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_moe_sorting -j
+```
+This will result in an executable `build/bin/tile_example_moe_sorting`
+
+## example
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -pr_i    index data type. (currently only fp32 supported now) (default:int32)
+       -pr_w    output weight data type(currently only fp32 supported now) (default:fp32)
+          -t    number of input tokens (default:32)
+          -e    number of experts (default:8)
+          -k    topk (default:2)
+       -st_i    row stride of input, -1 means same as experts (default:-1)
+       -seed    seed to be used, -1 means random every time (default:-1)
+      -kname    when set to 1 it will print kernel name (default:0)
+
+```
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
new file mode 100644
index 000000000..d2c4df105
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <set>
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "moe_sorting_api.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "weather do CPU validation or not")
+        .insert("pr_i", "int32", "index data type. (currently only int32 supported now)")
+        .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
+        .insert("t", "128", "number of input tokens")
+        .insert("e", "8", "number of num_experts")
+        .insert("k", "4", "topk")
+        .insert("unit", "32", "unit_size")
+        .insert("moe_buf_size", "0", "moe_buf_size")
+        .insert("seed", "-1", "seed to be used, -1 means random every time")
+        .insert("kname", "0", "when set to 1 it will print kernel name")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    size_t total_size = topk * tokens;
+    std::srand(seed);
+    std::set<IndexType> unique_set;
+    IndexType current_v;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+        {
+            unique_set.clear();
+        }
+        current_v = std::rand() % num_expert;
+        while(unique_set.find(current_v) != unique_set.end())
+        {
+            current_v = std::rand() % num_expert;
+        }
+        unique_set.insert(current_v);
+        host_tensor[i] = current_v;
+    }
+}
+
+template <typename WeightType, typename IndexType = ck_tile::index_t>
+bool test_moe_sorting(ck_tile::ArgParser args)
+{
+    int validate            = args.get_int("v");
+    std::string index_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+    int tokens              = args.get_int("t");
+    int num_experts         = args.get_int("e");
+    int topk                = args.get_int("k");
+    int seed                = args.get_int("seed");
+    int unit_size           = args.get_int("unit");
+    int moe_buf_size        = args.get_int("moe_buf_size");
+    int kname               = args.get_int("kname");
+    int warmup              = args.get_int("warmup");
+    int repeat              = args.get_int("repeat");
+    int max_output_ids =
+        ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size);
+
+    if(seed < 0)
+    {
+        seed = std::time(nullptr);
+    }
+
+    if(topk > num_experts)
+    {
+        printf("topk:%d value should be smaller than, or equal to number of num_experts:%d\n",
+               topk,
+               num_experts);
+        return false;
+    }
+
+    // tokens already considered batch size
+    ck_tile::HostTensor<IndexType> topk_ids_host({tokens, topk}, {topk, 1});
+    ck_tile::HostTensor<WeightType> weights_host({tokens, topk}, {topk, 1});
+    ck_tile::HostTensor<IndexType> sorted_ids_host({max_output_ids}, {1});
+    ck_tile::HostTensor<WeightType> sorted_weights_host({max_output_ids}, {1});
+    ck_tile::HostTensor<IndexType> sorted_expert_ids_host({max_output_ids / unit_size}, {1});
+    ck_tile::HostTensor<IndexType> sorted_id_cnt_host({1}, {1});
+    ck_tile::HostTensor<float> moe_buf_host({moe_buf_size});
+
+    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(weights_host);
+    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(moe_buf_host);
+    topid_unique_gen<IndexType>(topk_ids_host.mData, tokens, topk, num_experts, seed);
+
+    ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weights_dev(weights_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_ids_dev(sorted_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_weights_dev(sorted_weights_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_expert_ids_dev(
+        sorted_expert_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes());
+
+    topk_ids_dev.ToDevice(topk_ids_host.data());
+    weights_dev.ToDevice(weights_host.data());
+    if(moe_buf_size > 0)
+    {
+        moe_buf_dev.ToDevice(moe_buf_host.data());
+    }
+
+    moe_sorting_trait trait{index_prec, weight_prec};
+
+    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
+                          weights_dev.GetDeviceBuffer(),
+                          sorted_ids_dev.GetDeviceBuffer(),
+                          sorted_weights_dev.GetDeviceBuffer(),
+                          sorted_expert_ids_dev.GetDeviceBuffer(),
+                          sorted_id_cnt_dev.GetDeviceBuffer(),
+                          moe_buf_size > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          tokens,
+                          unit_size,
+                          num_experts,
+                          topk,
+                          static_cast<ck_tile::index_t>(moe_buf_size * sizeof(float))};
+
+    ck_tile::stream_config sc{nullptr,
+                              true,
+                              /* log_level = */ (kname ? 1 : 0),
+                              warmup,
+                              repeat};
+    auto ms = moe_sorting(trait, karg, sc);
+    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d,  ms:%f , ",
+           index_prec.c_str(),
+           weight_prec.c_str(),
+           tokens,
+           num_experts,
+           topk,
+           ms);
+    if(ms < 0)
+        printf("not supported\n");
+    fflush(stdout);
+    if(ms < 0)
+    {
+        return false;
+    }
+
+    sorted_ids_dev.FromDevice(sorted_ids_host.data());
+    sorted_weights_dev.FromDevice(sorted_weights_host.data());
+    sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data());
+    sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data());
+    if(moe_buf_size > 0)
+    {
+        moe_buf_dev.FromDevice(moe_buf_host.data());
+    }
+
+    bool rtn = true;
+    if(validate)
+    {
+        ck_tile::HostTensor<IndexType> sorted_ids_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<WeightType> sorted_weights_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<IndexType> sorted_expert_ids_ref({max_output_ids / unit_size}, {1});
+
+        int32_t ref_total_tokens_post_pad = 0;
+        ck_tile::reference_moe_sorting<WeightType, IndexType>(topk_ids_host,
+                                                              weights_host,
+                                                              sorted_ids_ref,
+                                                              sorted_weights_ref,
+                                                              sorted_expert_ids_ref,
+                                                              ref_total_tokens_post_pad,
+                                                              num_experts,
+                                                              unit_size);
+        rtn &= ck_tile::check_err(
+            sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6);
+        rtn &= ck_tile::check_err(sorted_weights_host,
+                                  sorted_weights_ref,
+                                  std::string("OUT Error: Incorrect w!"),
+                                  1e-6,
+                                  1e-6);
+        rtn &= ck_tile::check_err(sorted_expert_ids_host,
+                                  sorted_expert_ids_ref,
+                                  std::string("OUT Error: Incorrect eid!"),
+                                  1e-6,
+                                  1e-6);
+        if(moe_buf_size)
+        {
+            ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
+            rtn &= ck_tile::check_err(
+                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
+        }
+        rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
+    }
+
+    printf("valid:%s\n", rtn ? "y" : "n");
+    fflush(stdout);
+    return rtn;
+}
+
+int main(int argc, char** argv)
+{
+    auto [result, args] = create_args(argc, argv);
+    if(!result)
+        return -1;
+    std::string index_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+
+    bool r = true;
+    if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0)
+    {
+        r &= test_moe_sorting<float, ck_tile::index_t>(args);
+    }
+    return r ? 0 : -1;
+}
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
new file mode 100644
index 000000000..25e99c530
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_sorting_api.hpp"
+
+#define MOE_SORTING_DISPATCH(unroll_num_)                                                   \
+    constexpr ck_tile::index_t unroll_num = unroll_num_;                                    \
+    using ms_problem     = ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                           \
+    auto kargs           = kernel::MakeKargs(a);                                            \
+    const dim3 grids     = kernel::GridSize(a);                                             \
+    const dim3 blocks    = kernel::BlockSize(a);                                            \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                          \
+    float ave_time       = ck_tile::launch_kernel(                                          \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));          \
+    return ave_time;
+
+float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
+{
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        if(a.num_experts > 127)
+        {
+            printf("lds size exceed, only support experts <127 \n");
+            return -1;
+        }
+        if(a.moe_buf_bytes % 16)
+        {
+            printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes);
+            return -1;
+        }
+        using index_t              = ck_tile::index_t;
+        using ms_weight_type       = float;
+        index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64);
+        switch(smem_io_unroll_num)
+        {
+        case(1): {
+            MOE_SORTING_DISPATCH(1);
+        }
+        case(2): {
+            MOE_SORTING_DISPATCH(2);
+        }
+        case(3): {
+            MOE_SORTING_DISPATCH(3);
+        }
+        case(5): {
+            MOE_SORTING_DISPATCH(5);
+        }
+        case(6): {
+            MOE_SORTING_DISPATCH(6);
+        }
+        case(7): {
+            MOE_SORTING_DISPATCH(7);
+        }
+        case(8): {
+            MOE_SORTING_DISPATCH(8);
+        }
+        case(9): {
+            MOE_SORTING_DISPATCH(9);
+        }
+        case(10): {
+            MOE_SORTING_DISPATCH(10);
+        }
+        case(11): {
+            MOE_SORTING_DISPATCH(11);
+        }
+        default: {
+            MOE_SORTING_DISPATCH(4);
+        }
+        }
+    }
+    return -1;
+}
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
new file mode 100644
index 000000000..91b54932c
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/moe_sorting.hpp"
+
+struct moe_sorting_trait
+{
+    std::string index_type;
+    std::string weight_type; // currently always float
+};
+
+struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
+{
+};
+
+float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
diff --git a/example/ck_tile/13_moe_sorting/script/smoke_test.sh b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
new file mode 100644
index 000000000..1fc5eafcb
--- /dev/null
+++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
@@ -0,0 +1,19 @@
+# #!/bin/sh
+
+EXE=./build/bin/tile_example_moe_sorting
+
+$EXE -t=80 -e=17 -moe_buf_size=16
+$EXE -t=111 -e=117 -moe_buf_size=4
+$EXE -t=1000 -e=55 -moe_buf_size=1024
+$EXE -t=99 -e=120  -moe_buf_size=10244
+$EXE -t=175 -e=64 -k=8
+$EXE -t=65 -e=8 -k=2
+$EXE -t=1 -e=25
+$EXE -t=31 -e=19 -k=15
+$EXE -t=81 -e=37 -k=7
+$EXE -t=23 -e=1 -k=1
+$EXE -t=127 -e=99 -k=19
+$EXE -t=71 -e=11 -k=11
+$EXE -t=1 -e=1 -k=1
+$EXE -t=99 -e=2 -k=1
+$EXE -t=333 -e=99 -k=13
\ No newline at end of file
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 9dd9a6ca3..15db0f46c 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -12,3 +12,4 @@ add_subdirectory(09_topk_softmax)
 add_subdirectory(10_rmsnorm2d)
 add_subdirectory(11_add_rmsnorm2d_rdquant)
 add_subdirectory(12_smoothquant)
+add_subdirectory(13_moe_sorting)
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index c0ab13ce3..2e96009ac 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -23,6 +23,7 @@
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_moe_sorting.hpp"
 #include "ck_tile/host/reference/reference_permute.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp"
diff --git a/include/ck_tile/host/reference/reference_moe_sorting.hpp b/include/ck_tile/host/reference/reference_moe_sorting.hpp
new file mode 100644
index 000000000..c8eb7edb5
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_moe_sorting.hpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+namespace ck_tile {
+
+template <typename WeightType, typename IndexType = index_t>
+CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
+                                        const HostTensor<WeightType>& weights,
+                                        HostTensor<IndexType>& p_sorted_token_ids,
+                                        HostTensor<WeightType>& sorted_weight,
+                                        HostTensor<IndexType>& sorted_expert_ids,
+                                        index_t& unit_cnt,
+                                        const index_t experts,
+                                        const index_t unit_size)
+{
+    const index_t num_token = topk_ids.mDesc.get_lengths()[0];
+    const index_t topk      = topk_ids.mDesc.get_lengths()[1];
+    std::vector<std::vector<IndexType>> expert_tokens(experts,
+                                                      std::vector<IndexType>(unit_size, num_token));
+    std::vector<std::vector<WeightType>> expert_token_weights(
+        experts, std::vector<WeightType>(unit_size, 0));
+    std::vector<IndexType> expert_slices(experts, 1);
+    std::vector<IndexType> expert_slice_idxs(experts, 0);
+
+    for(index_t t = 0; t < num_token; t++)
+    {
+        for(index_t k = 0; k < topk; k++)
+        {
+            IndexType e  = topk_ids(t, k);
+            WeightType w = weights(t, k);
+            index_t idx  = expert_slice_idxs[e];
+            if(idx > expert_slices[e] * unit_size - 1)
+            {
+                expert_slices[e]++;
+                index_t new_size = expert_slices[e] * unit_size;
+                expert_tokens[e].resize(new_size);
+                expert_token_weights[e].resize(new_size);
+                for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
+                {
+                    expert_tokens[e][i]        = num_token;
+                    expert_token_weights[e][i] = 0;
+                }
+            }
+
+            expert_tokens[e][idx]        = t;
+            expert_token_weights[e][idx] = w;
+            expert_slice_idxs[e]++;
+        }
+    }
+
+    IndexType* out_tokens    = p_sorted_token_ids.data();
+    WeightType* out_weights  = sorted_weight.data();
+    IndexType* out_expert_id = sorted_expert_ids.data();
+    for(index_t e = 0; e < experts; e++)
+    {
+        memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
+        out_tokens += expert_slices[e] * unit_size;
+        memcpy(out_weights,
+               expert_token_weights[e].data(),
+               sizeof(WeightType) * expert_slices[e] * unit_size);
+        out_weights += expert_slices[e] * unit_size;
+
+        for(index_t s = 0; s < expert_slices[e]; s++)
+        {
+            out_expert_id[s] = e;
+            unit_cnt++;
+        }
+        out_expert_id += expert_slices[e];
+    }
+    unit_cnt *= unit_size;
+    return;
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
new file mode 100644
index 000000000..1c6acec70
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+struct MoeSortingHostArgs
+{
+    const void* p_topk_ids;
+    const void* p_weights;
+    void* p_sorted_token_ids;
+    void* p_sorted_weights;
+    void* p_sorted_expert_ids;
+    void* p_total_tokens_post_pad;
+    void* p_moe_buf;
+    index_t tokens;
+    index_t unit_size;
+    index_t num_experts;
+    index_t topk;
+    index_t moe_buf_bytes;
+};
+
+template <typename Problem_>
+struct MoeSortingKernel
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+
+    struct Kargs
+    {
+        const void* p_topk_ids;
+        const void* p_weights;
+        void* p_sorted_token_ids;
+        void* p_sorted_weights;
+        void* p_sorted_expert_ids;
+        void* p_total_tokens_post_pad;
+        void* p_moe_buf;
+        index_t tokens;
+        index_t num_experts;
+        index_t moe_buf_bytes;
+
+        index_t tokens_per_thread;
+        mdiv unit_size_mdiv;
+        mdiv topk_mdiv;
+    };
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
+    {
+        // TODO: assume num-experts not too much
+        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BlockSize(h).x * 16));
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs& h)
+    {
+        return dim3(ck_tile::integer_least_multiple(h.num_experts, ck_tile::get_warp_size()));
+    }
+
+    // in byte
+    CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h)
+    {
+        const auto blocks = BlockSize(h);
+        return ((blocks.x + 1) * h.num_experts + (h.num_experts + 1)) * sizeof(index_t);
+    }
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_topk_ids              = h.p_topk_ids;
+        k.p_weights               = h.p_weights;
+        k.p_sorted_token_ids      = h.p_sorted_token_ids;
+        k.p_sorted_weights        = h.p_sorted_weights;
+        k.p_sorted_expert_ids     = h.p_sorted_expert_ids;
+        k.p_moe_buf               = h.p_moe_buf;
+        k.p_total_tokens_post_pad = h.p_total_tokens_post_pad;
+        k.tokens                  = h.tokens;
+        k.num_experts             = h.num_experts;
+        k.moe_buf_bytes           = h.moe_buf_bytes;
+
+        const auto blocks   = BlockSize(h);
+        k.tokens_per_thread = integer_divide_ceil(h.tokens * h.topk, blocks.x);
+        k.unit_size_mdiv    = mdiv{static_cast<uint32_t>(h.unit_size)};
+        k.topk_mdiv         = mdiv{static_cast<uint32_t>(h.topk)};
+        return k;
+    }
+
+    CK_TILE_DEVICE index_t calc_index(index_t total_col, index_t row, index_t col) const
+    {
+        return row * total_col + col;
+    }
+
+    CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes) const
+    {
+        const index_t offset = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
+        if(offset < buf_bytes / 16)
+        {
+            buf[offset] = uint8x16_t{0};
+        }
+    }
+
+    CK_TILE_DEVICE void moe_align_block_size_kernel(const IndexType* __restrict__ topk_id,
+                                                    const WeightType* __restrict__ weights,
+                                                    index_t* p_sorted_token_ids,
+                                                    WeightType* p_sorted_weights,
+                                                    index_t* p_sorted_expert_ids,
+                                                    index_t* p_total_tokens_post_pad,
+                                                    const index_t num_experts,
+                                                    const index_t tokens_per_thread,
+                                                    const index_t numel,
+                                                    const mdiv unit_size_mdiv,
+                                                    const mdiv topk_mdiv,
+                                                    void* smem) const
+    {
+        const index_t tid       = static_cast<index_t>(threadIdx.x);
+        const index_t start_idx = tid * tokens_per_thread;
+
+        index_t* shared_mem = reinterpret_cast<index_t*>(smem);
+
+        index_t* tokens_cnts = shared_mem; // 2d: (blockDim.x + 1, num_experts)
+        index_t* cumsum      = shared_mem + (blockDim.x + 1) * num_experts; // 1: (num_experts + 1)
+        for(int i = 0; i < num_experts; ++i)
+        {
+            tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0;
+        }
+#pragma unroll Problem_::InternalLoadUnroll
+        for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
+        {
+            ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])];
+        }
+        __syncthreads();
+
+        if(tid < num_experts)
+        {
+            tokens_cnts[calc_index(num_experts, 0, tid)] = 0;
+            for(int i = 1; i <= static_cast<index_t>(blockDim.x); ++i)
+            {
+                tokens_cnts[calc_index(num_experts, i, tid)] +=
+                    tokens_cnts[calc_index(num_experts, i - 1, tid)];
+            }
+        }
+
+        // __syncthreads();
+        if(tid == 0)
+        {
+            cumsum[0] = 0;
+            for(int i = 1; i <= num_experts; ++i)
+            {
+                auto current_units = [&]() {
+                    index_t x_ = tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)] +
+                                 unit_size_mdiv.divisor - 1;
+                    index_t y_ = unit_size_mdiv.div(x_);
+                    return max(y_, 1) * unit_size_mdiv.divisor;
+                }();
+                cumsum[i] = cumsum[i - 1] + current_units;
+            }
+            *p_total_tokens_post_pad = cumsum[num_experts];
+        }
+        __syncthreads();
+        if(tid < num_experts)
+        {
+            for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size_mdiv.divisor)
+            {
+                p_sorted_expert_ids[unit_size_mdiv.div(i)] = tid;
+            }
+        }
+
+#pragma unroll Problem_::InternalLoadUnroll
+        for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
+        {
+            index_t expert_id = topk_id[i];
+            index_t rank_post_pad =
+                tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
+            p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i);
+            p_sorted_weights[rank_post_pad]   = weights[i];
+            ++tokens_cnts[calc_index(num_experts, tid, expert_id)];
+        }
+
+        const index_t prefill_token = topk_mdiv.div(numel);
+        if(tid < num_experts)
+        {
+            index_t expert_offset =
+                cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
+            while(expert_offset < cumsum[tid + 1])
+            {
+                p_sorted_token_ids[expert_offset] = prefill_token;
+                p_sorted_weights[expert_offset]   = static_cast<WeightType>(0.0);
+                expert_offset++;
+            }
+        }
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        if(blockIdx.x > 0)
+        {
+            if(kargs.p_moe_buf)
+            {
+                moe_buf_set_zero_kernel(reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
+                                        kargs.moe_buf_bytes);
+            }
+            return;
+        }
+        const size_t numel = kargs.tokens * kargs.topk_mdiv.divisor;
+        extern __shared__ char smem[];
+        return moe_align_block_size_kernel(static_cast<const IndexType*>(kargs.p_topk_ids),
+                                           static_cast<const WeightType*>(kargs.p_weights),
+                                           static_cast<IndexType*>(kargs.p_sorted_token_ids),
+                                           static_cast<WeightType*>(kargs.p_sorted_weights),
+                                           static_cast<IndexType*>(kargs.p_sorted_expert_ids),
+                                           static_cast<IndexType*>(kargs.p_total_tokens_post_pad),
+                                           kargs.num_experts,
+                                           kargs.tokens_per_thread,
+                                           numel,
+                                           kargs.unit_size_mdiv,
+                                           kargs.topk_mdiv,
+                                           smem);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
new file mode 100644
index 000000000..bbd47352d
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
+#include <string>
+#include <type_traits>
+
+#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
+#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
+#endif
+
+namespace ck_tile {
+
+// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
+// struct MoeSortingPipeline
+// {
+//     // TODO: this kernel only support warp per row
+//     using Problem    = remove_cvref_t<Problem_>;
+//     using Policy     = remove_cvref_t<Policy_>;
+//     using WeightType = typename Problem::WeightType;
+
+//     template <typename TopkIdWindow, typename WeightWindow>
+//     CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
+//                                    const WeightWindow& weight_window,
+//                                     index_t* p_sorted_token_ids,
+//                                     WeightType* p_sorted_weights,
+//                                     index_t* p_sorted_expert_ids,
+//                                     index_t* p_total_tokens_post_pad,
+//                                     const index_t num_experts,
+//                                     const index_t unit_size,
+//                                     const size_t numel,
+//                                     const index_t topk)
+//     {
+//     }
+// };
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp
new file mode 100644
index 000000000..f5218a93e
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/softmax.hpp"
+#include "ck_tile/ops/topk.hpp"
+
+namespace ck_tile {
+
+struct MoeSortingPolicy
+{
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
new file mode 100644
index 000000000..adde59e35
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include <string>
+#include <type_traits>
+
+namespace ck_tile {
+
+template <typename IndexType_, typename WeightType_, index_t InternalLoadUnroll_>
+struct MoeSortingProblem
+{
+    // TODO: this kernel only support warp per row
+    using WeightType = remove_cvref_t<WeightType_>;
+    using IndexType  = remove_cvref_t<IndexType_>;
+
+    static constexpr index_t WarpSize           = get_warp_size();
+    static constexpr index_t WarpsPerBlock      = 1;
+    static constexpr index_t InternalLoadUnroll = InternalLoadUnroll_;
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/moe_sorting.hpp b/include/ck_tile/ops/moe_sorting.hpp
new file mode 100644
index 000000000..b74607f06
--- /dev/null
+++ b/include/ck_tile/ops/moe_sorting.hpp
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
-- 
GitLab


From 13332998a4ca6dcc8cc5fcd401ca900529e5e65c Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Mon, 11 Nov 2024 09:28:32 +0800
Subject: [PATCH 048/153] Return nullptr when block index is invalid (#1649)

---
 .../ck_tile/ops/fmha/block/page_block_navigator.hpp    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/ck_tile/ops/fmha/block/page_block_navigator.hpp b/include/ck_tile/ops/fmha/block/page_block_navigator.hpp
index e8abdc579..5d158f9fb 100644
--- a/include/ck_tile/ops/fmha/block/page_block_navigator.hpp
+++ b/include/ck_tile/ops/fmha/block/page_block_navigator.hpp
@@ -230,7 +230,15 @@ struct PageBlockNavigator
     CK_TILE_HOST_DEVICE
     DataType* get_block_ptr(index_t block_index) const
     {
-        return physical_blocks + physical_block_indices[block_index] * block_stride + fixed_offset;
+        if(block_index < num_blocks)
+        {
+            return physical_blocks + physical_block_indices[block_index] * block_stride +
+                   fixed_offset;
+        }
+        else
+        {
+            return nullptr;
+        }
     }
 
     CK_TILE_HOST_DEVICE int32_t get_block_index(const WindowOrigin& global_window_origin) const
-- 
GitLab


From 8ef8a994e73370d69980a4df7377ed4ce8ed05c8 Mon Sep 17 00:00:00 2001
From: valarLip <103567126+valarLip@users.noreply.github.com>
Date: Mon, 11 Nov 2024 16:02:28 +0800
Subject: [PATCH 049/153] [CK_TILE] add more stride for layernorm to support
 un-continuous Tensor (#1650)

* [CK_TILE] add more stride for layernorm to support un-continuous Tensor

* align CK coding style

* extend strides to layernrom expample

* clang-format...
---
 .../02_layernorm2d/layernorm2d_fwd.cpp        | 63 ++++++++++++-------
 .../kernel/layernorm2d_fwd_kernel.hpp         | 23 ++++---
 2 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index 8f029c212..b49c04619 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -25,7 +25,10 @@ auto create_args(int argc, char* argv[])
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3328", "m dimension")
         .insert("n", "4096", "n dimension")
-        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("x_stride", "-1", "x row_stride, if -1 then equal to n")
+        .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n")
+        .insert("y_stride", "-1", "y row_stride, if -1 then equal to n")
+        .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
         .insert("e", "1e-5", "epsilon")
         .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
         .insert("v", "1", "cpu validation or not")
@@ -54,11 +57,20 @@ template <typename InDataType,
           bool SaveMeanVar>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    ck_tile::index_t m      = arg_parser.get_int("m");
-    ck_tile::index_t n      = arg_parser.get_int("n");
-    ck_tile::index_t stride = arg_parser.get_int("stride");
-    if(stride < 0)
-        stride = n;
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride");
+    if(xr_stride < 0)
+        xr_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
+    ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride");
+    if(yr_stride < 0)
+        yr_stride = n;
     float epsilon       = arg_parser.get_float("e");
     std::string prec_i  = arg_parser.get_str("prec_i");
     std::string prec_o  = arg_parser.get_str("prec_o");
@@ -89,7 +101,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         return false;
     }
 
-    assert(stride >= n);
+    assert(x_stride >= n);
 
     using TypeConfig = LayerNormTypeConfig<InDataType, OutDataType, XScaleDataType, YScaleDataType>;
 
@@ -108,15 +120,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using ComputeDataType = typename TypeConfig::ComputeDataType;
 
     // host verify
-    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
     ck_tile::HostTensor<GammaDataType> gamma_host({n});
     ck_tile::HostTensor<BetaDataType> beta_host({n});
 
-    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {stride, 1});
-    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {xr_stride, 1});
+    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {yr_stride, 1});
 
-    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {stride, 1});
-    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {y_stride, 1});
 
     ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
     ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
@@ -162,7 +174,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }();
 
     std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
+              << ", yr_stride:" << yr_stride << std::flush;
 
     layernorm2d_fwd_traits traits{
         prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant};
@@ -182,7 +196,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                               epsilon,
                               m,
                               n,
-                              stride};
+                              x_stride,   // x row_stride
+                              xr_stride,  // x residule row stride
+                              y_stride,   // y row stride
+                              yr_stride}; // y residule row stride
 
     float ave_time = layernorm2d_fwd(
         traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
@@ -285,7 +302,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
         y_buf.FromDevice(y_host_dev.data());
 
-        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {stride, 1});
+        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {yr_stride, 1});
         if(fused_add == 1)
         {
             y_residual_buf.FromDevice(y_residual_host_dev.data());
@@ -293,7 +310,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
         auto [rtol, atol] = get_elimit<InDataType>();
 
-        if(stride == n)
+        if(x_stride == n)
         {
             pass = ck_tile::check_err(
                 y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
@@ -310,10 +327,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
         {
             for(int i_r = 0; i_r < m; i_r++)
             {
-                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
-                                                      y_host_dev.begin() + i_r * stride + n);
-                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
-                                                      y_host_ref.begin() + i_r * stride + n);
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * y_stride,
+                                                      y_host_dev.begin() + i_r * y_stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * y_stride,
+                                                      y_host_ref.begin() + i_r * y_stride + n);
                 pass &= ck_tile::check_err(y_host_dev_row,
                                            y_host_ref_row,
                                            std::string("OUT[") + std::to_string(i_r) +
@@ -323,10 +340,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 if(fused_add == 1)
                 {
                     std::vector<YResidualDataType> y_residual_host_dev_row(
-                        y_residual_host_dev.begin() + i_r * stride,
-                        y_residual_host_dev.begin() + i_r * stride + n);
+                        y_residual_host_dev.begin() + i_r * yr_stride,
+                        y_residual_host_dev.begin() + i_r * yr_stride + n);
                     std::vector<YResidualDataType> y_residual_host_ref_row(
-                        x_host.begin() + i_r * stride, x_host.begin() + i_r * stride + n);
+                        x_host.begin() + i_r * yr_stride, x_host.begin() + i_r * yr_stride + n);
                     pass &= ck_tile::check_err(y_residual_host_dev_row,
                                                y_residual_host_ref_row,
                                                std::string("ADD[") + std::to_string(i_r) +
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index f5a214ba5..10218e808 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -28,7 +28,10 @@ struct Layernorm2dFwdHostArgs
 
     index_t m;
     index_t n;
-    index_t stride; // row_stride
+    index_t x_stride;  // x row_stride
+    index_t xr_stride; // x residule row stride
+    index_t y_stride;  // y row stride
+    index_t yr_stride; // y residule row stride
 };
 
 // TODO: Extract some type to wrapper class
@@ -93,7 +96,10 @@ struct Layernorm2dFwd
 
         index_t m;
         index_t n;
-        index_t stride; // row_stride
+        index_t x_stride;  // x row_stride
+        index_t xr_stride; // x residule row stride
+        index_t y_stride;  // y row stride
+        index_t yr_stride; // y residule row stride
     };
     using Hargs = Layernorm2dFwdHostArgs;
 
@@ -112,7 +118,10 @@ struct Layernorm2dFwd
                      hargs.epsilon,
                      hargs.m,
                      hargs.n,
-                     hargs.stride};
+                     hargs.x_stride,
+                     hargs.xr_stride,
+                     hargs.y_stride,
+                     hargs.yr_stride};
     }
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
@@ -182,7 +191,7 @@ struct Layernorm2dFwd
             const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const XDataType*>(kargs.p_x),
                 make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.x_stride, 1),
                 number<Vector_N>{},
                 number<1>{});
 
@@ -201,7 +210,7 @@ struct Layernorm2dFwd
                 const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                     static_cast<const XResidualDataType*>(kargs.p_x_residual),
                     make_tuple(kargs.m, kargs.n),
-                    make_tuple(kargs.stride, 1),
+                    make_tuple(kargs.xr_stride, 1),
                     number<Vector_N>{},
                     number<1>{});
 
@@ -250,7 +259,7 @@ struct Layernorm2dFwd
             auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<YDataType*>(kargs.p_y),
                 make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.y_stride, 1),
                 number<Vector_N>{},
                 number<1>{});
 
@@ -266,7 +275,7 @@ struct Layernorm2dFwd
                 auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                     static_cast<YResidualDataType*>(kargs.p_y_residual),
                     make_tuple(kargs.m, kargs.n),
-                    make_tuple(kargs.stride, 1),
+                    make_tuple(kargs.yr_stride, 1),
                     number<Vector_N>{},
                     number<1>{});
 
-- 
GitLab


From 5fb150dbe700eba180feb5b27973a8ba95fae2ce Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 11 Nov 2024 09:25:08 -0800
Subject: [PATCH 050/153] restore collecting performance of mixed prec gemms
 (#1648)

---
 script/process_perf_data.py | 4 ++--
 script/process_qa_data.sh   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index b82a7c289..3892206e4 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -133,12 +133,12 @@ def parse_logfile(logfile):
             if 'Best Perf' in line:
                 lst=line.split()
                 res.append(lst[4])
-    elif 'onnx_gemm' in logfile or 'mixed_gemm' in logfile:
+    elif 'onnx_gemm' in logfile:
         for line in open(logfile):
             if 'Best Perf' in line:
                 lst=line.split()
                 res.append(lst[33])
-    elif 'splitK_gemm' in logfile:
+    elif 'splitK_gemm' in logfile or 'mixed_gemm' in logfile:
         for line in open(logfile):
             if 'Best Perf' in line:
                 lst=line.split()
diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh
index d6083d2fc..c9a1645f6 100755
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -22,6 +22,7 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+python3 process_perf_data.py perf_mixed_gemm.log
 
 file=./perf_fmha_fwd_gfx942.log
 if [ -e "$file" ]; then
-- 
GitLab


From 2b6458ddf243904cecf4c54b48c9dafa60ff80df Mon Sep 17 00:00:00 2001
From: Thomas Ning <Thomas.Ning@amd.com>
Date: Tue, 12 Nov 2024 10:08:25 +0800
Subject: [PATCH 051/153] [CK Tile] Improve the Layout, Padding, and Alignment
 features of CK Tile GEMM (#1651)

* Finished the feature

* Modified the test file

* Test case update

* addresss comment

* Addressed the review comment

* Fixed the CI error
---
 example/ck_tile/03_gemm/README.md             |   3 +
 example/ck_tile/03_gemm/gemm_basic.cpp        |  19 +-
 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp |  10 +-
 include/ck_tile/core/tensor/shuffle_tile.hpp  |   2 +-
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp |   2 +
 .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp   |  70 ++--
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   |   6 +-
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |  63 +++-
 ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 330 ++++++++++++++----
 .../gemm/pipeline/gemm_pipeline_problem.hpp   | 154 ++++++--
 ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 316 ++++++++++++++---
 .../ops/gemm/pipeline/tile_gemm_traits.hpp    |  16 +-
 .../gemm/test_gemm_mem_pipeline_util.hpp      |  12 +-
 13 files changed, 781 insertions(+), 222 deletions(-)

diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md
index aacbdf686..e9ffe72a9 100644
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -8,7 +8,10 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
 sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the gemm calculation
 make tile_example_gemm_basic -j
+# The memory bound pipeline on the gemm calculation
+make tile_example_gemm_mem_pipeline -j
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic`
 
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index 09427217c..b7d869344 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -17,10 +17,11 @@
 template <typename ALayout, typename BLayout, typename CLayout>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
-    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadA        = true;
-    constexpr bool kPadB        = true;
-    constexpr bool kPadC        = true;
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
     constexpr bool kTilePermute = false;
     // The rank and permutation will also be generate out by the CodeGen part.
     constexpr ck_tile::index_t kOutputRank = 2;
@@ -56,8 +57,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
         CShuffleEpilogue,
         ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
                                                                    CDataType,
-                                                                   kPadA,
-                                                                   kPadB,
+                                                                   kPadM,
+                                                                   kPadN,
                                                                    kTilePermute,
                                                                    kOutputRank,
                                                                    1,
@@ -65,13 +66,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
                                                                    TilePartitioner::kM,
                                                                    TilePartitioner::kN>>,
         ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>>;
+            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
 
     using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
     using CodegenPipelineProblem = ck_tile::
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy<ALayout, BLayout, CLayout>;
+    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
     using CodegenGemmPipeline =
         ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
     // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
index 2ee0395e4..ff9d8bad3 100644
--- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
+++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
@@ -31,9 +31,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     constexpr ck_tile::index_t K_Warp_Tile = 8;
 
     // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadA = true;
-    constexpr bool kPadB = true;
-    constexpr bool kPadC = true;
+    constexpr bool kPadM = true;
+    constexpr bool kPadN = true;
+    constexpr bool kPadK = true;
 
     constexpr int kBlockPerCu = 1;
 
@@ -46,9 +46,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
 
     using GemmEpilogue = ck_tile::Default2DEpilogue<
-        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, false, kPadC>>;
+        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
 
-    using Traits = ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
 
     using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
diff --git a/include/ck_tile/core/tensor/shuffle_tile.hpp b/include/ck_tile/core/tensor/shuffle_tile.hpp
index da3c7117e..55e3274cd 100644
--- a/include/ck_tile/core/tensor/shuffle_tile.hpp
+++ b/include/ck_tile/core/tensor/shuffle_tile.hpp
@@ -170,7 +170,7 @@ CK_TILE_DEVICE void shuffle_tile(OutTensor& out, const InTensor& in)
     }
     else
     {
-        // NOT implemented
+        static_assert(false, "The shuffle should always happen!");
     }
 }
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index fbb05e164..a3a29bb54 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -863,6 +863,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             constexpr index_t K0 = kKPerBlock / K1;
             constexpr index_t N2 = get_warp_size() / K0;
             constexpr index_t N1 = kBlockSize / get_warp_size();
+            static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+            static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
             constexpr index_t N0 = kNPerBlock / (N2 * N1);
             static_assert(N0 != 0);
 
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 1671ddad3..96af6e826 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -115,12 +115,22 @@ struct GemmKernel
             }
         }();
 
-        auto a_pad_view = pad_tensor_view(
-            a_tensor_view,
-            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-            // somehow clang-format is splitting below line into multiple.
-            // clang-format off
-            sequence<false, GemmPipeline::kPadA>{});
+        auto a_pad_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
         // clang-format on
 
         auto a_block_window = make_tile_window(
@@ -128,12 +138,22 @@ struct GemmKernel
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
             {i_m, 0});
 
-        auto b_pad_view = pad_tensor_view(
-            b_tensor_view,
-            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-            // clang-format off
-            sequence<false, GemmPipeline::kPadB>{});
-        // clang-format on
+        auto b_pad_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadN, false>{});
+            }
+        }();
 
         auto b_block_window = make_tile_window(
             b_pad_view,
@@ -171,18 +191,28 @@ struct GemmKernel
             }
         }();
 
-        auto c_pad_view = pad_tensor_view(
-            c_tensor_view,
-            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
-            // clang-format off
-            sequence<false, GemmPipeline::kPadC>{});
-        // clang-format on
-        auto c_block_window = make_tile_window(
+        auto c_pad_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<false, GemmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        auto CBlockWindow_pad = make_tile_window(
             c_pad_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
             {i_m, i_n});
 
-        EpiloguePipeline{}(c_block_window, c_block_tile);
+        EpiloguePipeline{}(CBlockWindow_pad, c_block_tile);
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index b9b45d3f4..85c5c5805 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -113,9 +113,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     static constexpr index_t VectorSizeB = Problem::VectorSizeB;
     static constexpr index_t VectorSizeC = Problem::VectorSizeC;
 
-    static constexpr bool kPadA = Problem::kPadA;
-    static constexpr bool kPadB = Problem::kPadB;
-    static constexpr bool kPadC = Problem::kPadC;
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
 
     // Where is the right place for HasHotLoop and TailNum ???
     static constexpr bool HasHotLoop = Problem::HasHotLoop;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index a2424290e..c0817e736 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -33,9 +33,9 @@ struct GemmPipelineAGmemBGmemCRegV1
     static constexpr index_t VectorSizeB = Problem::VectorSizeB;
     static constexpr index_t VectorSizeC = Problem::VectorSizeC;
 
-    static constexpr bool kPadA = Problem::kPadA;
-    static constexpr bool kPadB = Problem::kPadB;
-    static constexpr bool kPadC = Problem::kPadC;
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
     {
@@ -101,11 +101,8 @@ struct GemmPipelineAGmemBGmemCRegV1
                              Policy::template MakeADramTileDistribution<Problem>());
 
         // A LDS tile window for store
-        auto a_copy_lds_window =
-            make_tile_window(a_lds_block,
-                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
-                             {0, 0},
-                             a_copy_dram_window.get_tile_distribution());
+        auto a_copy_lds_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
 
         // B DRAM tile window for load
         auto b_copy_dram_window =
@@ -115,11 +112,8 @@ struct GemmPipelineAGmemBGmemCRegV1
                              Policy::template MakeBDramTileDistribution<Problem>());
 
         // B LDS tile window for store
-        auto b_copy_lds_window =
-            make_tile_window(b_lds_block,
-                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
-                             {0, 0},
-                             b_copy_dram_window.get_tile_distribution());
+        auto b_copy_lds_window = make_tile_window(
+            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
 
         // A LDS tile for block GEMM
         auto a_lds_gemm_window = make_tile_window(
@@ -149,12 +143,32 @@ struct GemmPipelineAGmemBGmemCRegV1
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
             // LDS write 0
-            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-            store_tile(a_copy_lds_window, a_block_tile_tmp);
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
+                    Policy::template MakeShuffledARegBlockDescriptor<Problem>());
+                shuffle_tile(a_shuffle_tmp, a_block_tile);
+                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
+                store_tile(a_copy_lds_window, a_block_tile_tmp);
+            }
+            else
+            {
+                store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile));
+            }
 
             // LDS write 0
-            const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-            store_tile(b_copy_lds_window, b_block_tile_tmp);
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
+                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                shuffle_tile(b_shuffle_tmp, b_block_tile);
+                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp);
+                store_tile(b_copy_lds_window, b_block_tile_tmp);
+            }
+            else
+            {
+                store_tile(b_copy_lds_window, tile_elementwise_in(b_element_func, b_block_tile));
+            }
         }
 
         index_t iCounter = num_loop - 1;
@@ -180,8 +194,19 @@ struct GemmPipelineAGmemBGmemCRegV1
             store_tile(a_copy_lds_window, a_block_tile_tmp);
 
             // LDS write i + 1
-            const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
-            store_tile(b_copy_lds_window, b_block_tile_tmp);
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                auto b_shuffle_tmp_loop = make_static_distributed_tensor<BDataType>(
+                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                shuffle_tile(b_shuffle_tmp_loop, b_block_tile);
+                store_tile(b_copy_lds_window,
+                           tile_elementwise_in(b_element_func, b_shuffle_tmp_loop));
+            }
+            else
+            {
+                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
+                store_tile(b_copy_lds_window, b_block_tile_tmp);
+            }
 
             iCounter--;
         }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index 199ba56aa..c765b3ce9 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -11,6 +11,7 @@ namespace ck_tile {
 // Default policy class should not be templated, put template on member functions instead
 struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
 {
+
 #if 0
     // 2d
     template <typename Problem>
@@ -116,6 +117,20 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
 
         return smem_size;
     }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        return Problem::VectorLoadSize / sizeof(ADataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        return Problem::VectorLoadSize / sizeof(BDataType);
+    }
 #elif 1
     // fake XOR
     template <typename Problem>
@@ -192,80 +207,269 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
     {
         using ADataType = remove_cvref_t<typename Problem::ADataType>;
-
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr index_t K1 = 16 / sizeof(ADataType);
-        constexpr index_t K0 = kKPerBlock / K1;
-        constexpr index_t M2 = get_warp_size() / K0;
-#if 1 // coalesce reading for each blocks
-        constexpr index_t M1 = kBlockSize / get_warp_size();
-        static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
-        static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
-        constexpr index_t M0 = kMPerBlock / (M2 * M1);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<1>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 1>>{});
-#else // coalesce reading for each warps
-        constexpr index_t M0 = kBlockSize / get_warp_size();
-        constexpr index_t M1 = kMPerBlock / (M2 * M0);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<0>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<1, 1>>{});
-#endif
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M0           = MPerBlock / M1;
+            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % M1 == 0);
+            constexpr index_t K3    = total_pixels / M1;
+            constexpr index_t KPack = GetSmemPackA<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * M0))
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * M0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * M0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = 16 / sizeof(ADataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t M2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (M2 * K0) == 0)
+            {
+                constexpr index_t M1 = BlockSize / get_warp_size();
+                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
+                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
+                constexpr index_t M0 = MPerBlock / (M2 * M1);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            else
+            {
+                constexpr index_t M0 = BlockSize / get_warp_size();
+                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
     {
         using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t N0           = NPerBlock / N1;
+            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % N1 == 0);
+            constexpr index_t K3    = total_pixels / N1;
+            constexpr index_t KPack = GetSmemPackB<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * N0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * N0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t N2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (N2 * K0) == 0)
+            {
+                constexpr index_t N1 = BlockSize / get_warp_size();
+                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
+                constexpr index_t N0 = NPerBlock / (N2 * N1);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            // coalesce reading for each warps
+            else
+            {
+                constexpr index_t N0 = BlockSize / get_warp_size();
+                constexpr index_t N1 = NPerBlock / (N2 * N0);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
 
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor()
+    {
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
         constexpr index_t kBlockSize = Problem::kBlockSize;
-
         constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
 
-        constexpr index_t K1 = 16 / sizeof(BDataType);
-        constexpr index_t K0 = kKPerBlock / K1;
-        constexpr index_t N2 = get_warp_size() / K0;
-#if 1 // coalesce reading for each blocks
-        constexpr index_t N1 = kBlockSize / get_warp_size();
-        static_assert(N2 != 0, "M2 is zero, which will lead to a division by zero error.");
-        static_assert(N1 != 0, "M1 is zero, which will lead to a division by zero error.");
-        constexpr index_t N0 = kNPerBlock / (N2 * N1);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<1>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 1>>{});
-#else // coalesce reading for each warps
-        constexpr index_t N0 = kBlockSize / get_warp_size();
-        constexpr index_t N1 = kNPerBlock / (N2 * N0);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<0>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<1, 1>>{});
-#endif
+        constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+        constexpr index_t N0           = kNPerBlock / N1;
+        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
+        static_assert(total_pixels % N1 == 0);
+        constexpr index_t K3     = total_pixels / N1;
+        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * N0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * N0);
+            constexpr index_t K0 = kBlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * N0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor()
+    {
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+        constexpr index_t M0           = kMPerBlock / M1;
+        constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize;
+        static_assert(total_pixels % M1 == 0);
+        constexpr index_t K3     = total_pixels / M1;
+        constexpr index_t kKPack = GetSmemPackA<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * M0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * M0);
+            constexpr index_t K0 = kBlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * M0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
     }
 
     template <typename Problem>
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index 1156f549b..3c43790bd 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -3,40 +3,133 @@
 
 #pragma once
 
-#include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 
 namespace ck_tile {
 
-static constexpr int _VectorSize = 16;
-
 template <typename ADataType_,
           typename BDataType_,
           typename CDataType_,
           typename BlockGemmShape_,
           typename TileGemmTraits_>
-struct GemmPipelineProblem
+struct GemmPipelineProblemBase
 {
-    using ADataType      = remove_cvref_t<ADataType_>;
-    using BDataType      = remove_cvref_t<BDataType_>;
-    using CDataType      = remove_cvref_t<CDataType_>;
+    using GemmTraits = remove_cvref_t<TileGemmTraits_>;
+
+    using ADataType = remove_cvref_t<ADataType_>;
+    using BDataType = remove_cvref_t<BDataType_>;
+    using CDataType = remove_cvref_t<CDataType_>;
+
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
-    using GemmTraits     = remove_cvref_t<TileGemmTraits_>;
 
     using ALayout = remove_cvref_t<typename GemmTraits::ALayout>;
     using BLayout = remove_cvref_t<typename GemmTraits::BLayout>;
     using CLayout = remove_cvref_t<typename GemmTraits::CLayout>;
 
-    static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
-    static constexpr bool kPadA         = GemmTraits::kPadA;
-    static constexpr bool kPadB         = GemmTraits::kPadB;
-    static constexpr bool kPadC         = GemmTraits::kPadC;
+    static constexpr index_t VectorLoadSize = GemmTraits::_VectorSize;
+    static constexpr index_t kBlockSize     = BlockGemmShape::NumWarps * get_warp_size();
+
+    static constexpr bool kPadM = GemmTraits::kPadM;
+    static constexpr bool kPadN = GemmTraits::kPadN;
+    static constexpr bool kPadK = GemmTraits::kPadK;
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA()
+    {
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t pixels_per_thread =
+                BlockGemmShape::kM * BlockGemmShape::kK / kBlockSize;
+            return pixels_per_thread < VectorLoadSize / sizeof(ADataType)
+                       ? pixels_per_thread
+                       : VectorLoadSize / sizeof(ADataType);
+        }
+        else
+        {
+            return VectorLoadSize / sizeof(ADataType);
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentB()
+    {
+        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t pixels_per_thread =
+                BlockGemmShape::kN * BlockGemmShape::kK / kBlockSize;
+            return pixels_per_thread < VectorLoadSize / sizeof(BDataType)
+                       ? pixels_per_thread
+                       : VectorLoadSize / sizeof(BDataType);
+        }
+        else
+        {
+            return VectorLoadSize / sizeof(BDataType);
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentC()
+    {
+        if constexpr(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t N1 = kBlockSize / get_warp_size();
+            constexpr index_t N2 = std::min(BlockGemmShape::kN / N1, get_warp_size());
+            constexpr index_t M0 = get_warp_size() / N2;
+            constexpr index_t M1 = BlockGemmShape::kM / M0;
 
-    static constexpr index_t VectorSizeA = kPadA ? 1 : _VectorSize / sizeof(ADataType);
-    static constexpr index_t VectorSizeB = kPadB ? 1 : _VectorSize / sizeof(BDataType);
-    static constexpr index_t VectorSizeC = kPadC ? 1 : _VectorSize / sizeof(CDataType);
+            return std::min(M1, static_cast<index_t>(VectorLoadSize / sizeof(CDataType)));
+        }
+        else
+        {
+            constexpr index_t M1 = kBlockSize / get_warp_size();
+            constexpr index_t M2 = std::min(BlockGemmShape::kM / M1, get_warp_size());
+            constexpr index_t N0 = get_warp_size() / M2;
+            constexpr index_t N1 = BlockGemmShape::kN / N0;
+
+            return std::min(N1, static_cast<index_t>(VectorLoadSize / sizeof(CDataType)));
+        }
+    }
+
+    static constexpr index_t VectorSizeA = []() {
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        {
+            return kPadK ? 1 : GetAlignmentA();
+        }
+        else
+        {
+            return kPadM ? 1 : GetAlignmentA();
+        }
+    }();
+
+    static constexpr index_t VectorSizeB = []() {
+        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+        {
+            return kPadN ? 1 : GetAlignmentB();
+        }
+        else
+        {
+            return kPadK ? 1 : GetAlignmentB();
+        }
+    }();
+
+    static constexpr index_t VectorSizeC = []() {
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            return kPadN ? 1 : GetAlignmentC();
+        }
+        else
+        {
+            return kPadM ? 1 : GetAlignmentC();
+        }
+    }();
 };
 
+// Alias for GemmPipelineProblem
+template <typename ADataType_,
+          typename BDataType_,
+          typename CDataType_,
+          typename BlockGemmShape_,
+          typename TileGemmTraits_>
+using GemmPipelineProblem =
+    GemmPipelineProblemBase<ADataType_, BDataType_, CDataType_, BlockGemmShape_, TileGemmTraits_>;
+
 template <typename ADataType_,
           typename BDataType_,
           typename CDataType_,
@@ -45,30 +138,15 @@ template <typename ADataType_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
           TailNumber TailNum_              = TailNumber::Full>
-struct UniversalGemmPipelineProblem
+struct UniversalGemmPipelineProblem : public GemmPipelineProblemBase<ADataType_,
+                                                                     BDataType_,
+                                                                     CDataType_,
+                                                                     BlockGemmShape_,
+                                                                     TileGemmTraits_>
 {
-    using ADataType      = remove_cvref_t<ADataType_>;
-    using BDataType      = remove_cvref_t<BDataType_>;
-    using CDataType      = remove_cvref_t<CDataType_>;
-    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
-    using GemmTraits     = remove_cvref_t<TileGemmTraits_>;
-
-    using ALayout = remove_cvref_t<typename GemmTraits::ALayout>;
-    using BLayout = remove_cvref_t<typename GemmTraits::BLayout>;
-    using CLayout = remove_cvref_t<typename GemmTraits::CLayout>;
-
-    static constexpr auto Scheduler     = Scheduler_;
-    static constexpr auto HasHotLoop    = HasHotLoop_;
-    static constexpr auto TailNum       = TailNum_;
-    static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
-
-    static constexpr bool kPadA = GemmTraits::kPadA;
-    static constexpr bool kPadB = GemmTraits::kPadB;
-    static constexpr bool kPadC = GemmTraits::kPadC;
-
-    static constexpr index_t VectorSizeA = kPadA ? _VectorSize / sizeof(ADataType) : 1;
-    static constexpr index_t VectorSizeB = kPadB ? _VectorSize / sizeof(BDataType) : 1;
-    static constexpr index_t VectorSizeC = kPadC ? _VectorSize / sizeof(CDataType) : 1;
+    static constexpr auto Scheduler  = Scheduler_;
+    static constexpr auto HasHotLoop = HasHotLoop_;
+    static constexpr auto TailNum    = TailNum_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index 7044a5314..207f1f9e4 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -9,12 +9,8 @@
 namespace ck_tile {
 
 // UniversalGemm Policy
-template <typename LayoutA_, typename LayoutB_, typename LayoutC_>
 struct UniversalGemmPipelineAgBgCrPolicy
 {
-    using LayoutA = remove_cvref_t<LayoutA_>;
-    using LayoutB = remove_cvref_t<LayoutB_>;
-    using LayoutC = remove_cvref_t<LayoutC_>;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
@@ -34,13 +30,14 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                 TransposeC>;
 
         using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
 
         constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
         constexpr index_t K1        = WarpGemm::kK;
         constexpr index_t K0        = KPerBlock / K1;
 
-        if constexpr(std::is_same<tensor_layout::gemm::RowMajor, LayoutA>::value)
+        if constexpr(std::is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
             constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
                                                   ? 1
@@ -176,13 +173,15 @@ struct UniversalGemmPipelineAgBgCrPolicy
 
         using BDataType = remove_cvref_t<typename Problem::BDataType>;
 
+        using BLayout = remove_cvref_t<typename Problem::BLayout>;
+
         constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
 
         constexpr index_t K1 = WarpGemm::kK;
         constexpr index_t K0 = KPerBlock / K1;
 
-        if constexpr(std::is_same<tensor_layout::gemm::ColumnMajor, LayoutB>::value)
+        if constexpr(std::is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
             // NLdsLayer * K0 as logical Bank
             constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
@@ -331,72 +330,285 @@ struct UniversalGemmPipelineAgBgCrPolicy
         return smem_size;
     }
 
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        return Problem::VectorLoadSize / sizeof(ADataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        return Problem::VectorLoadSize / sizeof(BDataType);
+    }
+
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
     {
-        using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
-                                                typename Problem::CDataType,
-                                                Problem::BlockGemmShape::WarpTile::at(I0),
-                                                Problem::BlockGemmShape::WarpTile::at(I1),
-                                                Problem::BlockGemmShape::WarpTile::at(I2),
-                                                TransposeC>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
 
         constexpr index_t BlockSize = Problem::kBlockSize;
 
         constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
 
-        constexpr index_t K1 = WarpGemm::kK;
-        constexpr index_t K0 = KPerBlock / K1;
-        constexpr index_t M2 = get_warp_size() / K0;
-
-        constexpr index_t M1 = BlockSize / get_warp_size();
-        static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
-        static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
-        constexpr index_t M0 = MPerBlock / (M2 * M1);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<1>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 1>>{});
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M0           = MPerBlock / M1;
+            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % M1 == 0);
+            constexpr index_t K3    = total_pixels / M1;
+            constexpr index_t KPack = GetSmemPackA<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * M0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * M0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * M0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t M2 = get_warp_size() / K0;
+            if constexpr(get_warp_size() % (M2 * K0) == 0)
+            {
+                constexpr index_t M1 = BlockSize / get_warp_size();
+                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
+                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
+                constexpr index_t M0 = MPerBlock / (M2 * M1);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            else
+            {
+                constexpr index_t M0 = BlockSize / get_warp_size();
+                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
     {
-        using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
-                                                typename Problem::CDataType,
-                                                Problem::BlockGemmShape::WarpTile::at(I0),
-                                                Problem::BlockGemmShape::WarpTile::at(I1),
-                                                Problem::BlockGemmShape::WarpTile::at(I2),
-                                                TransposeC>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
 
         constexpr index_t BlockSize = Problem::kBlockSize;
 
         constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
 
-        constexpr index_t K1 = WarpGemm::kK;
-        constexpr index_t K0 = KPerBlock / K1;
-        constexpr index_t N2 = get_warp_size() / K0;
-
-        constexpr index_t N1 = BlockSize / get_warp_size();
-        static_assert(N2 != 0, "M2 is zero, which will lead to a division by zero error.");
-        static_assert(N1 != 0, "M1 is zero, which will lead to a division by zero error.");
-        constexpr index_t N0 = NPerBlock / (N2 * N1);
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<1>, sequence<2, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 1>>{});
+        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t N0           = NPerBlock / N1;
+            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % N1 == 0);
+            constexpr index_t K3    = total_pixels / N1;
+            constexpr index_t KPack = GetSmemPackB<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * N0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * N0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t N2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (N2 * K0) == 0)
+            {
+                constexpr index_t N1 = BlockSize / get_warp_size();
+                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
+                constexpr index_t N0 = NPerBlock / (N2 * N1);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            // coalesce reading for each warps
+            else
+            {
+                constexpr index_t N0 = BlockSize / get_warp_size();
+                constexpr index_t N1 = NPerBlock / (N2 * N0);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor()
+    {
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+        constexpr index_t M0           = MPerBlock / M1;
+        constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+        static_assert(total_pixels % M1 == 0);
+        constexpr index_t K3     = total_pixels / M1;
+        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * M0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * M0);
+            constexpr index_t K0 = BlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * M0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+            static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor()
+    {
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+        constexpr index_t N0           = NPerBlock / N1;
+        constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+        static_assert(total_pixels % N1 == 0);
+        constexpr index_t K3     = total_pixels / N1;
+        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * N0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * N0);
+            constexpr index_t K0 = BlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * N0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+            static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
     }
 
     template <typename Problem>
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
index 9d050be2f..34756c3ff 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
@@ -3,19 +3,23 @@
 
 #pragma once
 
+#include "ck_tile/core.hpp"
+
 namespace ck_tile {
 
-template <bool kPadA_,
-          bool kPadB_,
-          bool kPadC_,
+template <bool kPadM_,
+          bool kPadN_,
+          bool kPadK_,
           typename ALayout_,
           typename BLayout_,
           typename CLayout_>
 struct TileGemmTraits
 {
-    static constexpr bool kPadA = kPadA_;
-    static constexpr bool kPadB = kPadB_;
-    static constexpr bool kPadC = kPadC_;
+    static constexpr bool kPadM = kPadM_;
+    static constexpr bool kPadN = kPadN_;
+    static constexpr bool kPadK = kPadK_;
+
+    static constexpr int _VectorSize = 16;
 
     using ALayout = ALayout_;
     using BLayout = BLayout_;
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
index 1b243ab43..6b4789833 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
@@ -53,9 +53,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         constexpr ck_tile::index_t N_Warp_Tile = 32;
         constexpr ck_tile::index_t K_Warp_Tile = 8;
 
-        constexpr bool kPadA = true;
-        constexpr bool kPadB = true;
-        constexpr bool kPadC = true;
+        constexpr bool kPadM = true;
+        constexpr bool kPadN = true;
+        constexpr bool kPadK = true;
 
         constexpr int kBlockPerCu = 1;
 
@@ -68,9 +68,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
 
         using GemmEpilogue = ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, false, kPadC>>;
+            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
 
-        using Traits = ck_tile::TileGemmTraits<kPadA, kPadB, kPadC, ALayout, BLayout, CLayout>;
+        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
 
         using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
             ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
@@ -108,7 +108,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
 
             if(s.log_level_ > 0)
             {
-                std::cout << "Lunching kernel with args:"
+                std::cout << "Launching kernel with args:"
                           << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                           << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
                           << "}" << std::endl;
-- 
GitLab


From 489c78d0735b7817859a22722e381f62f345cea7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 12 Nov 2024 09:35:33 -0800
Subject: [PATCH 052/153] test rocm6.3 rc1 build 20 (#1659)

---
 Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index e2e2bc276..791d1d9f3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,10 +24,10 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \
         sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
         sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
     elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3.0.1-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3.0.1-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3.0.1 rel-5 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2033700; \
+        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \
+        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \
+        amdgpu-repo --amdgpu-build=2074281; \
     fi
 
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
-- 
GitLab


From d20735691ccb9429ed66f42f831385c709707d62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 13 Nov 2024 11:46:18 +0100
Subject: [PATCH 053/153] [CK TILE] Update gemm universal pipeline (#1644)

* [CK TILE] Update gemm universal pipeline

* Fixes

* fix

* Rebase
---
 ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 399 +++++-------------
 1 file changed, 116 insertions(+), 283 deletions(-)

diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index 207f1f9e4..94b0faf03 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -18,289 +18,136 @@ struct UniversalGemmPipelineAgBgCrPolicy
 
     static constexpr bool TransposeC = true;
 
+    template <typename Problem, typename DataType, index_t MNPerBlock>
+    CK_TILE_HOST_DEVICE static constexpr auto GetVectorLoadSize()
+    {
+        constexpr index_t BlockSize           = Problem::kBlockSize;
+        constexpr index_t KPerBlock           = Problem::BlockGemmShape::kK;
+        constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
+
+        if constexpr(elements_per_thread % (16 / sizeof(DataType)) == 0)
+        {
+            return (16 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (8 / sizeof(DataType)) == 0)
+        {
+            return (8 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (4 / sizeof(DataType)) == 0 &&
+                          sizeof(DataType) >= 4)
+        {
+            return (4 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (2 / sizeof(DataType)) == 0 &&
+                          sizeof(DataType) >= 2)
+        {
+            return (2 / sizeof(DataType));
+        }
+        else
+        {
+            return 1;
+        }
+    }
+
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
     {
-        using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
-                                                typename Problem::CDataType,
-                                                Problem::BlockGemmShape::WarpTile::at(I0),
-                                                Problem::BlockGemmShape::WarpTile::at(I1),
-                                                Problem::BlockGemmShape::WarpTile::at(I2),
-                                                TransposeC>;
 
         using ADataType = remove_cvref_t<typename Problem::ADataType>;
-        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
 
         constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t K1        = WarpGemm::kK;
-        constexpr index_t K0        = KPerBlock / K1;
-
-        if constexpr(std::is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-        {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
-                                                  ? 1
-                                                  : 32 * 4 / KPerBlock / sizeof(ADataType);
-            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
-                make_tuple(K0 * number<MLdsLayer>{}, number<MPerBlock / MLdsLayer>{}, K1),
-                make_tuple(K1, number<KPerBlock * MLdsLayer>{}, I1));
-
-            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
-                a_lds_block_desc,
-                make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
-                                                         number<K0 * MLdsLayer>{})),
-                           make_pass_through_transform(K1)),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}));
-
-            constexpr auto a_lds_block_desc_ak0_kMLdsLayer_m_ak1 = transform_tensor_descriptor(
-                a_lds_block_desc_permuted,
-                make_tuple(make_unmerge_transform(make_tuple(K0, number<MLdsLayer>{})),
-                           make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
-                           make_pass_through_transform(K1)),
-                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
-                make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
-
-            constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
-                a_lds_block_desc_ak0_kMLdsLayer_m_ak1,
-                make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{}))),
-                make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
-                make_tuple(sequence<1>{}, sequence<0>{}));
-
-            return a_lds_block_desc_m_k;
-        }
-        else // ColumnMajor A
-        {
-            // kfold and mpair dimension is not always required.
-            // more dimension in merge_transform increase the difficulty of generating immarg offset
-            // for compiler.
-            constexpr auto M0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I0);
-            constexpr auto M1 = MPerBlock / M0;
-
-            constexpr auto KThreadWrite     = Problem::kBlockSize / M0;
-            constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / WarpGemm::kM;
-            constexpr auto K0PerThreadRead  = K0 / KThreadRead;
-
-            constexpr auto kfold =
-                (K1 * M0 * sizeof(ADataType) > 128) ? 1 : 128 / (K1 * M0 * sizeof(ADataType));
-            constexpr auto KThreadReadPerm =
-                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
-                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
-                    : KThreadRead;
-
-            // 1<=mpair<=kN0
-            constexpr auto mpair = (K1 * WarpGemm::kM * sizeof(ADataType) > 128)
-                                       ? 1
-                                       : ((128 / (K1 * WarpGemm::kM * sizeof(ADataType))) > M0
-                                              ? M0
-                                              : 128 / (K1 * WarpGemm::kM * sizeof(ADataType)));
-
-            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
-                make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
-                           number<K0PerThreadWrite>{},
-                           number<KThreadReadPerm * M1>{},
-                           number<kfold * M0 / mpair>{},
-                           number<mpair>{},
-                           K1));
-
-            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
-                a_lds_block_desc,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_xor_transform(
-                        make_tuple(number<KThreadReadPerm * M1>{}, number<kfold * M0 / mpair>{})),
-                    make_pass_through_transform(number<mpair>{}),
-                    make_pass_through_transform(K1)),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
-
-            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
-                a_lds_block_desc_permuted,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<M1>{})),
-                    make_unmerge_transform(make_tuple(number<kfold>{}, number<M0 / mpair>{})),
-                    make_pass_through_transform(number<mpair>{}),
-                    make_pass_through_transform(K1)),
-                make_tuple(sequence<0>{},
-                           sequence<1>{},
-                           sequence<2>{},
-                           sequence<3>{},
-                           sequence<4>{},
-                           sequence<5>{}),
-                make_tuple(sequence<1>{},
-                           sequence<2>{},
-                           sequence<0, 3>{},
-                           sequence<4, 5>{},
-                           sequence<6>{},
-                           sequence<7>{}));
-
-            constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
-                a_lds_block_desc_unmerged,
-                make_tuple(make_merge_transform_v3_division_mod(
-                               make_tuple(number<KThreadReadPerm>{},
-                                          number<KThreadWrite / kfold / KThreadReadPerm>{},
-                                          number<kfold>{},
-                                          number<K0PerThreadWrite>{},
-                                          K1)),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(number<M0 / mpair>{}, number<mpair>{}, number<M1>{}))),
-                make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
-                make_tuple(sequence<1>{}, sequence<0>{}));
-
-            return a_lds_block_desc_m_k;
-        }
+        constexpr index_t KPack     = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
+
+        constexpr auto DataTypeSize = sizeof(ADataType);
+        constexpr auto MLdsLayer =
+            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
+                       number<MPerBlock / MLdsLayer>{},
+                       number<KPack>{}),
+            make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
+                                                     number<KPerBlock / KPack * MLdsLayer>{})),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+        constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
+            a_lds_block_desc_permuted,
+            make_tuple(make_unmerge_transform(
+                           make_tuple(number<KPerBlock / KPack>{}, number<MLdsLayer>{})),
+                       make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_xk0_mnldslayer_mn_xk1,
+            make_tuple(make_merge_transform_v3_division_mod(
+                           make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc;
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
     {
-        using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
-                                                typename Problem::CDataType,
-                                                Problem::BlockGemmShape::WarpTile::at(I0),
-                                                Problem::BlockGemmShape::WarpTile::at(I1),
-                                                Problem::BlockGemmShape::WarpTile::at(I2),
-                                                TransposeC>;
 
         using BDataType = remove_cvref_t<typename Problem::BDataType>;
 
-        using BLayout = remove_cvref_t<typename Problem::BLayout>;
-
         constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
         constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr index_t K1 = WarpGemm::kK;
-        constexpr index_t K0 = KPerBlock / K1;
-
-        if constexpr(std::is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-        {
-            // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(BDataType);
-            ;
-            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
-                make_tuple(K0 * number<NLdsLayer>{}, number<NPerBlock / NLdsLayer>{}, K1),
-                make_tuple(K1, number<KPerBlock * NLdsLayer>{}, I1));
-
-            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
-                b_lds_block_desc,
-                make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
-                                                         number<K0 * NLdsLayer>{})),
-                           make_pass_through_transform(K1)),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}));
-
-            constexpr auto b_lds_block_desc_bk0_kNLdsLayer_n_bk1 = transform_tensor_descriptor(
-                b_lds_block_desc_permuted,
-                make_tuple(make_unmerge_transform(make_tuple(K0, number<NLdsLayer>{})),
-                           make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
-                           make_pass_through_transform(K1)),
-                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
-                make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
-
-            constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
-                b_lds_block_desc_bk0_kNLdsLayer_n_bk1,
-                make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{}))),
-                make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
-                make_tuple(sequence<1>{}, sequence<0>{}));
-
-            return b_lds_block_desc_n_k;
-        }
-        else // RowMajor B
-        {
-            constexpr auto N0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I1);
-            constexpr auto N1 = NPerBlock / N0;
-
-            constexpr auto KThreadWrite     = Problem::kBlockSize / N0;
-            constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / WarpGemm::kN;
-            constexpr auto K0PerThreadRead  = K0 / KThreadRead;
-
-            constexpr auto kfold =
-                (K1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (K1 * N0 * sizeof(BDataType));
-            constexpr auto KThreadReadPerm =
-                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
-                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
-                    : KThreadRead;
-
-            // 1<=npair<=kN0
-            constexpr auto npair = (K1 * WarpGemm::kN * sizeof(BDataType) > 128)
-                                       ? 1
-                                       : ((128 / (K1 * WarpGemm::kN * sizeof(BDataType))) > N0
-                                              ? N0
-                                              : 128 / (K1 * WarpGemm::kN * sizeof(BDataType)));
-
-            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
-                make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
-                           number<K0PerThreadWrite>{},
-                           number<KThreadReadPerm * N1>{},
-                           number<kfold * N0 / npair>{},
-                           number<npair>{},
-                           K1));
-
-            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
-                b_lds_block_desc,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_xor_transform(
-                        make_tuple(number<KThreadReadPerm * N1>{}, number<kfold * N0 / npair>{})),
-                    make_pass_through_transform(number<npair>{}),
-                    make_pass_through_transform(K1)),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
-
-            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
-                b_lds_block_desc_permuted,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<N1>{})),
-                    make_unmerge_transform(make_tuple(number<kfold>{}, number<N0 / npair>{})),
-                    make_pass_through_transform(number<npair>{}),
-                    make_pass_through_transform(K1)),
-                make_tuple(sequence<0>{},
-                           sequence<1>{},
-                           sequence<2>{},
-                           sequence<3>{},
-                           sequence<4>{},
-                           sequence<5>{}),
-                make_tuple(sequence<1>{},
-                           sequence<2>{},
-                           sequence<0, 3>{},
-                           sequence<4, 5>{},
-                           sequence<6>{},
-                           sequence<7>{}));
-
-            constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
-                b_lds_block_desc_unmerged,
-                make_tuple(make_merge_transform_v3_division_mod(
-                               make_tuple(number<KThreadReadPerm>{},
-                                          number<KThreadWrite / kfold / KThreadReadPerm>{},
-                                          number<kfold>{},
-                                          number<K0PerThreadWrite>{},
-                                          K1)),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{}))),
-                make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
-                make_tuple(sequence<1>{}, sequence<0>{}));
-
-            return b_lds_block_desc_n_k;
-        }
+        constexpr index_t KPack     = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
+
+        constexpr auto DataTypeSize = sizeof(BDataType);
+        constexpr auto NLdsLayer =
+            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<KPerBlock / KPack * NLdsLayer>{},
+                       number<NPerBlock / NLdsLayer>{},
+                       number<KPack>{}),
+            make_tuple(number<KPack>{}, number<KPerBlock * NLdsLayer>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+
+        constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+            b_lds_block_desc_0,
+            make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
+                                                     number<KPerBlock / KPack * NLdsLayer>{})),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+        constexpr auto b_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
+            b_lds_block_desc_permuted,
+            make_tuple(make_unmerge_transform(
+                           make_tuple(number<KPerBlock / KPack>{}, number<NLdsLayer>{})),
+                       make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+            b_lds_block_desc_xk0_mnldslayer_mn_xk1,
+            make_tuple(make_merge_transform_v3_division_mod(
+                           make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{})),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+        return b_lds_block_desc;
     }
 
     template <typename Problem>
@@ -330,20 +177,6 @@ struct UniversalGemmPipelineAgBgCrPolicy
         return smem_size;
     }
 
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
-    {
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-        return Problem::VectorLoadSize / sizeof(ADataType);
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB()
-    {
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-        return Problem::VectorLoadSize / sizeof(BDataType);
-    }
-
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
     {
@@ -362,7 +195,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
             constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
             static_assert(total_pixels % M1 == 0);
             constexpr index_t K3    = total_pixels / M1;
-            constexpr index_t KPack = GetSmemPackA<Problem>();
+            constexpr index_t KPack = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
             static_assert(KPack % K3 == 0);
             constexpr index_t K2 = KPack / K3;
             if constexpr(get_warp_size() % (K2 * M0) == 0)
@@ -445,7 +278,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
             constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
             static_assert(total_pixels % N1 == 0);
             constexpr index_t K3    = total_pixels / N1;
-            constexpr index_t KPack = GetSmemPackB<Problem>();
+            constexpr index_t KPack = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
             static_assert(KPack % K3 == 0);
             constexpr index_t K2 = KPack / K3;
             if constexpr(get_warp_size() % (K2 * N0) == 0)
@@ -530,7 +363,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
         constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
         static_assert(total_pixels % M1 == 0);
         constexpr index_t K3     = total_pixels / M1;
-        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        constexpr index_t kKPack = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
         static_assert(kKPack % K3 == 0);
         constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
         constexpr index_t warp_size = get_warp_size();
@@ -578,7 +411,7 @@ struct UniversalGemmPipelineAgBgCrPolicy
         constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
         static_assert(total_pixels % N1 == 0);
         constexpr index_t K3     = total_pixels / N1;
-        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        constexpr index_t kKPack = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
         static_assert(kKPack % K3 == 0);
         constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
         constexpr index_t warp_size = get_warp_size();
-- 
GitLab


From 73f02a108347d626ee9b31789f0ff8b26ef87006 Mon Sep 17 00:00:00 2001
From: Taylor Ding <taylding@amd.com>
Date: Wed, 13 Nov 2024 11:20:38 -0500
Subject: [PATCH 054/153] Move checks for compatibility from Argument() to
 IsSupportedArgument() (#1653)

---
 ..._grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
index 6bb5d431c..17b7d962d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -381,10 +381,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                         {
                             tildes = {i_ztilde, i_ytilde, i_xtilde};
                         }
-                        else
-                        {
-                            throw std::runtime_error("wrong! only implemented for 2D and 3D now");
-                        }
 
                         const auto a_grid_desc_ak0_m_ak1 =
                             transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
@@ -749,6 +745,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                 return false;
             }
         }
+        
+        // check number of dimension, only implemented for 2D and 3D now
+        if(NDimSpatial != 2 && NDimSpatial != 3)
+        {
+            return false;
+        }
 
         return true;
     }
-- 
GitLab


From efd92615459c83d1af3f226f846b395323374a74 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 13 Nov 2024 09:20:18 -0800
Subject: [PATCH 055/153] fix clang format (#1662)

---
 .../device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
index 17b7d962d..3fb047f20 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -745,7 +745,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                 return false;
             }
         }
-        
+
         // check number of dimension, only implemented for 2D and 3D now
         if(NDimSpatial != 2 && NDimSpatial != 3)
         {
-- 
GitLab


From c1f8d53ce83c6ca6d15fec8d987974bc05008c16 Mon Sep 17 00:00:00 2001
From: feli <felix.li@amd.com>
Date: Thu, 14 Nov 2024 14:06:36 +0800
Subject: [PATCH 056/153] [Ck_tile] hot fix, fix rpcf param setting err (#1657)

Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
---
 .../pipeline/layernorm2d_fwd_pipeline_one_pass.hpp |  2 +-
 .../pipeline/layernorm2d_fwd_pipeline_two_pass.hpp | 14 +++++++++++---
 .../ck_tile/ops/welford/block/block_welford.hpp    | 13 +++++++++----
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index 4b83ed4fb..eefdaf917 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -121,7 +121,7 @@ struct Layernorm2dFwdPipelineOnePass
         auto [mean, var] = block_welford(acc, cur_count, max_count);
         block_welford_sync(mean, var, cur_count);
         block_welford_cross_warp_sync(mean, var, cur_count, smem);
-        block_tile_welford_post_scale_var(var, cur_count);
+        block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
 
         // compute inv-std
         auto inv_std = tile_elementwise_in(
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index fadf56dfd..6a86cc43c 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -35,6 +35,7 @@ struct Layernorm2dFwdPipelineTwoPass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
     static constexpr bool kPadN              = Problem::Traits::kPadN;
+    static constexpr bool kFastFDiv          = Problem::Traits::kFastFDiv;
     static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
@@ -137,15 +138,22 @@ struct Layernorm2dFwdPipelineTwoPass
 
         block_welford_sync(mean, var, cur_count);
         block_welford_cross_warp_sync(mean, var, cur_count, smem);
-        block_tile_welford_post_scale_var(var, cur_count);
+        block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
 
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
-                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
+                if(kFastFDiv && std::is_same_v<ComputeDataType, float>)
+                {
+                    return type_convert<ComputeDataType>(1.0f) *
+                           __builtin_amdgcn_rcpf(sqrt(v_ + epsilon));
+                }
+                else
+                {
+                    return type_convert<ComputeDataType>(1.0f) / sqrt(v_ + epsilon);
+                }
             },
             var);
-
         if constexpr(kSaveMean)
             store_tile(mean_window, cast_tile<MeanDataType>(mean));
         if constexpr(kSaveInvStd)
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
index 968895e38..56ca86d9d 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -47,8 +47,11 @@ struct BlockWelford
 
                     auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
 
-                    welford_update(
-                        mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_);
+                    welford_update(mean_tensor(out_dstr_idx),
+                                   var_tensor(out_dstr_idx),
+                                   x,
+                                   cur_count_,
+                                   constant<kFastFDiv>{});
                 });
             }
         });
@@ -159,7 +162,8 @@ struct BlockWelfordSync
                                       v_local_count,
                                       v_remote_mean,
                                       v_remote_var,
-                                      v_remote_count);
+                                      v_remote_count,
+                                      constant<kFastFDiv>{});
                     });
                 }
             });
@@ -307,7 +311,8 @@ struct BlockWelfordCrossWarpSync
                               v_local_count,
                               v_remote_mean,
                               v_remote_var,
-                              v_remote_count);
+                              v_remote_count,
+                              constant<kFastFDiv>{});
             });
 
             mean_tensor.get_thread_buffer()(i_0) = v_local_mean;
-- 
GitLab


From d805a461aae7454de448bc0305cce01192fbc198 Mon Sep 17 00:00:00 2001
From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
Date: Thu, 14 Nov 2024 09:40:50 -0700
Subject: [PATCH 057/153] Fix example_convnd_fwd_max_xdl_int8 failures on MI300
 (#1666)

* Improve test verbosity.

* BUGFIX: Add missing initialization for reduction buffer

* Change default initialization method

Performance may be affected for fp32 and int8 examples.

* Improve test verbosity

* Cleanup
---
 .../common.hpp                                |  2 +-
 .../run_convnd_fwd_max_example.inc            | 57 +++++++++++++------
 .../gemm_add_add_mean_meansquare_xdl_fp16.cpp |  2 +-
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
index 7e3130a1a..036f288d0 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -80,7 +80,7 @@ using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
 struct ExecutionConfig final
 {
     bool do_verification = true;
-    int init_method      = 1;
+    int init_method      = 2;
     bool time_kernel     = false;
 };
 
diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
index cebfeb51d..d61aee81a 100644
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -73,16 +73,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
     Tensor<R0DataType> r0_device(r0_desc);
 
+    std::cout << "input: " << conv_input.mDesc << std::endl;
+    std::cout << "weight: " << conv_weight.mDesc << std::endl;
+    std::cout << "output: " << conv_output_device.mDesc << std::endl;
+    std::cout << "reduction: " << r0_device.mDesc << std::endl << std::endl;
+
     switch(config.init_method)
     {
     case 0: break;
     case 1:
         ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-1, 1}(conv_weight);
+        break;
+    case 2:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
+        ck::utils::FillUniformDistribution<BDataType>{-1, 1}(conv_weight);
         break;
     default:
-        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
-        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
+        ck::utils::FillUniformDistribution<ADataType>{-8, 7}(conv_input);
+        ck::utils::FillUniformDistribution<BDataType>{-1, 1}(conv_weight);
     }
 
     DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
@@ -161,15 +170,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         return false;
     }
 
+    // XXX: DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle will not initialize r0.
+    r0_device_buf.SetValue(ck::NumericLimits<R0DataType>::Lowest());
+
     const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
 
-    const std::size_t flop      = problem_size.GetFlops();
-    const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
+    if(config.time_kernel)
+    {
+        const std::size_t flop      = problem_size.GetFlops();
+        const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
 
-    const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-    const float gb_per_sec = num_btype / 1.E6 / avg_time;
-    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << conv.GetTypeString() << std::endl;
+        const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+        const float gb_per_sec = num_btype / 1.E6 / avg_time;
+        std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << conv.GetTypeString() << std::endl;
+    }
+    else
+    {
+        std::cout << "FINISHED: " << conv.GetTypeString() << std::endl;
+    }
 
     if(config.do_verification)
     {
@@ -189,6 +208,7 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
                                                   BElementOp{},
                                                   PassThrough{});
 
+        std::cout << "\nRunning verification on CPU." << std::endl;
         ref_invoker.Run(ref_argument);
 
         Tensor<R0DataType> r0_host(r0_device.mDesc);
@@ -273,13 +293,18 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         conv_output_device_buf.FromDevice(conv_output_device.mData.data());
         r0_device_buf.FromDevice(r0_device.mData.data());
 
-        return ck::utils::check_err(conv_output_device,
-                                    conv_output_host,
-                                    "Error: incorrect results! (Matrix E)",
-                                    1e-5f,
-                                    1e-4f) &&
-               ck::utils::check_err(
-                   r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
+        auto pass = ck::utils::check_err(conv_output_device,
+                                         conv_output_host,
+                                         "Error: incorrect results! (Matrix E)",
+                                         1e-3f,
+                                         1e-3f);
+        pass =
+            pass && ck::utils::check_err(
+                        r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-3f, 1e-3f);
+        if(pass)
+            std::cout << "Verification on CPU: PASS" << std::endl;
+
+        return pass;
     }
 
     return true;
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
index 2f6533d44..a46eaa481 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -198,7 +198,7 @@ int main()
         throw std::runtime_error("wrong! this device_op instance does not support this problem");
     }
 
-    // init reducetion buffer to 0
+    // init reduction buffer to 0
     r0_device_buf.SetZero();
     r1_device_buf.SetZero();
 
-- 
GitLab


From 3b6a481e92d8ba2a9f9e87136678b05bcaf573a7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 14 Nov 2024 16:14:50 -0800
Subject: [PATCH 058/153] re-enable coerce-illegal-types flag for rocm6.3
 (#1668)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd2f60683..4bb69300a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,7 +221,7 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
 endif()
 set(check-coerce)
 check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
-if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
+if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132)
    message("Adding the amdgpu-coerce-illegal-types=1")
    add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
 endif()
-- 
GitLab


From b4a79045829b07f7e80603fb773c196e1f7a7214 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 14 Nov 2024 16:15:01 -0800
Subject: [PATCH 059/153] re-enable fp8 gemms in ckProfiler (#1667)

---
 CMakeLists.txt                                  | 6 ++++--
 profiler/src/profile_gemm_universal.cpp         | 6 +++---
 test/gemm_universal/test_gemm_universal_xdl.cpp | 4 ++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bb69300a..b28a6d912 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,12 +183,14 @@ message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     message("Enabling XDL instances")
     add_definitions(-DCK_USE_XDL)
-    set(CK_USE_XDL "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    message("Enabling FP8 gemms in ckProfiler")
+    add_definitions(-DCK_USE_GFX94)
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     message("Enabling WMMA instances")
     add_definitions(-DCK_USE_WMMA)
-    set(CK_USE_WMMA "ON")
 endif()
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index 576bd009b..990cbd292 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -101,7 +101,7 @@ int profile_gemm_universal(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
 #endif
 
@@ -164,7 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
@@ -198,7 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl.cpp
index 23b5c74dd..b872d7089 100644
--- a/test/gemm_universal/test_gemm_universal_xdl.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl.cpp
@@ -56,7 +56,7 @@ class TestGemmUniversal_KM_NK
 using KernelTypes_MK_KN = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<      F16,       F16,             F16,     F16>,
-#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
     std::tuple<      F16,        F8,             F16,     F16>,
     std::tuple<       F8,       F16,             F16,     F16>,
     std::tuple<       F8,        F8,              F8,    BF16>,
@@ -66,7 +66,7 @@ using KernelTypes_MK_KN = ::testing::Types<
 using KernelTypes_MK_NK = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<      F16,       F16,             F16,     F16>,
-#if defined(CK_ENABLE_FP8) && defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
     std::tuple<      F16,        F8,             F16,     F16>,
     std::tuple<       F8,       F16,             F16,     F16>,
     std::tuple<       F8,        F8,              F8,    BF16>,
-- 
GitLab


From efb34741fe1f6af938e32b80fa5a30211d8dd71c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 15 Nov 2024 18:30:58 -0500
Subject: [PATCH 060/153] Bump rocm-docs-core from 1.8.3 to 1.8.4 in
 /docs/sphinx (#1670)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.3 to 1.8.4.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.8.4/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.3...v1.8.4)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index c2220e15d..9824df626 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.8.3
+rocm-docs-core==1.8.4
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 0dc2e70c5..f89fbcf27 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.8.3
+rocm-docs-core==1.8.4
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 754adc70e3c98c08dc64f7338d8a2e5e5f38dc3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Mon, 18 Nov 2024 14:03:45 +0100
Subject: [PATCH 061/153] Batched GEMM Multiple D based on Universal GEMM
 (#1655)

* Batched GEMM Multiple D based on Universal GEMM

Co-authored-by: Jing Zhang <jizhan@fb.com>

* CI fixes

Co-authored-by: Jing Zhang <jizhan@fb.com>

---------

Co-authored-by: Jing Zhang <jizhan@fb.com>
---
 example/24_batched_gemm/CMakeLists.txt        |    6 +
 .../batched_gemm_xdl_bf16_v3.cpp              |   99 ++
 .../batched_gemm_xdl_fp8_rowwise_v3.cpp       |  106 ++
 .../run_batched_gemm_example.inc              |   36 +-
 .../run_batched_gemm_example_rowwise.inc      |  280 +++++
 .../device/device_batched_gemm_multi_d.hpp    |   43 +-
 ...atched_gemm_multiple_d_xdl_cshuffle_v3.hpp | 1014 +++++++++++++++++
 .../gpu/gemm_universal_batched.hpp            |  185 +++
 .../gpu/CMakeLists.txt                        |    9 +
 .../gpu/gemm_universal_batched/CMakeLists.txt |   19 +
 ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp |   95 ++
 ...16_bf16_mk_nk_mn_comp_default_instance.cpp |   32 +
 ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp |   33 +
 ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp |   33 +
 ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp |  109 ++
 ...f8_bf16_mk_nk_mn_comp_default_instance.cpp |   32 +
 ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp |   33 +
 ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp |   33 +
 .../profile_gemm_universal_batched_impl.hpp   |  280 +++++
 profiler/src/CMakeLists.txt                   |    2 +
 .../src/profile_gemm_universal_batched.cpp    |  187 +++
 21 files changed, 2655 insertions(+), 11 deletions(-)
 create mode 100644 example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
 create mode 100644 example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
 create mode 100644 example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
 create mode 100644 profiler/src/profile_gemm_universal_batched.cpp

diff --git a/example/24_batched_gemm/CMakeLists.txt b/example/24_batched_gemm/CMakeLists.txt
index 4cb45be7c..720af39af 100644
--- a/example/24_batched_gemm/CMakeLists.txt
+++ b/example/24_batched_gemm/CMakeLists.txt
@@ -9,6 +9,12 @@ add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16)
 add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp)
 add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16)
 
+add_example_executable(example_batched_gemm_xdl_bf16_v3 batched_gemm_xdl_bf16_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16_v3)
+
+add_example_executable(example_batched_gemm_xdl_fp8_rowwise_v3 batched_gemm_xdl_fp8_rowwise_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp8_rowwise_v3)
+
 add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
 add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8)
 
diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
new file mode 100644
index 000000000..fa8b75218
--- /dev/null
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmDefault,
+    256,            // BlockSize
+    256,            // MPerBlock
+    128,            // NPerBlock
+    32,             // KPerBlock
+    8,              // AK1
+    8,              // BK1
+    32,             // MPerXDL
+    32,             // NPerXDL
+    4,              // MXdlPerWave
+    2,              // NXdlPerWave
+    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
+    2,              // ABlockTransferSrcVectorDim
+    8,              // ABlockTransferSrcScalarPerVector
+    8,              // ABlockTransferDstScalarPerVector_AK1
+    1,              // ABlockLdsExtraM
+    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
+    2,              // BBlockTransferSrcVectorDim
+    8,              // BBlockTransferSrcScalarPerVector
+    8,              // BBlockTransferDstScalarPerVector_BK1
+    1,              // BBlockLdsExtraN
+    1,              // CShuffleMXdlPerWavePerShuffle
+    1,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    S<8>,           // CDEShuffleBlockTransferScalarPerVectors
+    ck::BlockGemmPipelineScheduler::Intrawave, // BlockGemmPipelineScheduler
+    ck::BlockGemmPipelineVersion::v3           // BlockGemmPipelineVersion
+    >;
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
new file mode 100644
index 000000000..f0160b31c
--- /dev/null
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F8   = ck::f8_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough      = ck::tensor_operation::element_wise::PassThrough;
+using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply;
+
+using ADataType        = F8;
+using BDataType        = F8;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Col;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MultiplyMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmDefault,
+    256,            // BlockSize
+    256,            // MPerBlock
+    128,            // NPerBlock
+    32,             // KPerBlock
+    8,              // AK1
+    8,              // BK1
+    32,             // MPerXDL
+    32,             // NPerXDL
+    4,              // MXdlPerWave
+    2,              // NXdlPerWave
+    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
+    2,              // ABlockTransferSrcVectorDim
+    8,              // ABlockTransferSrcScalarPerVector
+    8,              // ABlockTransferDstScalarPerVector_AK1
+    1,              // ABlockLdsExtraM
+    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
+    2,              // BBlockTransferSrcVectorDim
+    8,              // BBlockTransferSrcScalarPerVector
+    8,              // BBlockTransferDstScalarPerVector_BK1
+    1,              // BBlockLdsExtraN
+    1,              // CShuffleMXdlPerWavePerShuffle
+    1,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    S<8, 8, 1>,     // CDEShuffleBlockTransferScalarPerVectors
+    ck::BlockGemmPipelineScheduler::Interwave, // BlockGemmPipelineScheduler
+    ck::BlockGemmPipelineVersion::v1,          // BlockGemmPipelineVersion
+    F8                                         // ComputeTypeA
+    >;
+
+#include "run_batched_gemm_example_rowwise.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_rowwise_example(argc, argv); }
diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc
index 21934add3..741512bf0 100644
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -210,17 +210,9 @@ bool run_batched_gemm_example(int argc, char* argv[])
 
     problem_size.M = 256 * (dis(gen) + 1);
     problem_size.N = 128 * (dis(gen) + 1);
-    problem_size.K = 64 * (dis(gen) + 2);
+    problem_size.K = 128 * (dis(gen) + 2);
 
-    problem_size.stride_A = problem_size.K;
-    problem_size.stride_B = problem_size.K;
-    problem_size.stride_C = problem_size.N;
-
-    problem_size.batch_stride_A = problem_size.M * problem_size.K;
-    problem_size.batch_stride_B = problem_size.K * problem_size.N;
-    problem_size.batch_stride_C = problem_size.M * problem_size.N;
-
-    problem_size.batch_count = 16;
+    problem_size.batch_count = 2;
 
     if(argc == 4)
     {
@@ -228,13 +220,37 @@ bool run_batched_gemm_example(int argc, char* argv[])
         config.init_method     = std::stoi(argv[2]);
         config.time_kernel     = std::stoi(argv[3]);
     }
+    else if(argc == 8)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.M           = std::stoi(argv[4]);
+        problem_size.N           = std::stoi(argv[5]);
+        problem_size.K           = std::stoi(argv[6]);
+        problem_size.batch_count = std::stoi(argv[7]);
+    }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("optinal\n");
+        printf("arg4-7: M = %d N = %d K = %d Batch = %d\n",
+               problem_size.M,
+               problem_size.N,
+               problem_size.K,
+               problem_size.batch_count);
         exit(0);
     }
 
+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
     return run_batched_gemm(problem_size, config);
 }
diff --git a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
new file mode 100644
index 000000000..778be8ffd
--- /dev/null
+++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <random>
+
+#pragma once
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t stride_A = K;
+    ck::index_t stride_B = K;
+    ck::index_t stride_C = N;
+
+    ck::index_t stride_D0 = 0;
+    ck::index_t stride_D1 = 0;
+
+    ck::index_t batch_stride_A = M * K;
+    ck::index_t batch_stride_B = K * N;
+    ck::index_t batch_stride_C = M * N;
+
+    ck::index_t batch_stride_D0 = N;
+    ck::index_t batch_stride_D1 = M;
+
+    ck::index_t batch_count = 16;
+};
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M,
+           N,
+           K,
+           stride_A,
+           stride_B,
+           stride_C,
+           stride_D0,
+           stride_D1,
+           batch_stride_A,
+           batch_stride_B,
+           batch_stride_C,
+           batch_stride_D0,
+           batch_stride_D1,
+           batch_count] = problem_size;
+
+    // GEMM shape
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+        }
+    };
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
+    Tensor<D0DataType> d0_g_m_n(
+        f_host_tensor_descriptor(batch_count, M, N, stride_D0, batch_stride_D0, D0Layout{}));
+    Tensor<D1DataType> d1_g_m_n(
+        f_host_tensor_descriptor(batch_count, M, N, stride_D1, batch_stride_D1, D1Layout{}));
+    Tensor<EDataType> e_g_m_n_device_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "d0_g_m_n: " << d0_g_m_n.mDesc << std::endl;
+    std::cout << "d1_g_m_n: " << d1_g_m_n.mDesc << std::endl;
+    std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    d0_g_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+    d1_g_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+
+    d0_device_buf.ToDevice(d0_g_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_g_m_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    // do GEMM
+    auto argument =
+        gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                          b_device_buf.GetDeviceBuffer(),
+                          {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                          c_device_buf.GetDeviceBuffer(),
+                          M,
+                          N,
+                          K,
+                          batch_count,
+                          stride_A,
+                          stride_B,
+                          {stride_D0, stride_D1},
+                          stride_C,
+                          batch_stride_A,
+                          batch_stride_B,
+                          {batch_stride_D0, batch_stride_D1},
+                          batch_stride_C,
+                          a_element_op,
+                          b_element_op,
+                          cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    invoker.Run(argument, StreamConfig{nullptr, false});
+    bool pass = true;
+
+    if(config.do_verification)
+    {
+        c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_g_m_n({batch_count, M, N});
+
+        using ReferenceBatchedGemmInstance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                             BDataType,
+                                                             CShuffleDataType,
+                                                             AccDataType,
+                                                             AElementOp,
+                                                             BElementOp,
+                                                             PassThrough>;
+
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+        Tensor<EDataType> e_g_m_n_host_result(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, c_g_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int b = 0; b < batch_count; ++b)
+        {
+            for(int m = 0; m < M; ++m)
+            {
+                for(int n = 0; n < N; ++n)
+                {
+                    cde_element_op(e_g_m_n_host_result(b, m, n),
+                                   c_g_m_n(b, m, n),
+                                   d0_g_m_n(b, m, n),
+                                   d1_g_m_n(b, m, n));
+                }
+            }
+        }
+
+        pass = ck::utils::check_err(
+            e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c");
+    }
+
+    if(config.time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+        std::size_t flop      = std::size_t(2) * batch_count * M * N * K;
+        std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
+                                sizeof(BDataType) * batch_count * K * N +
+                                sizeof(EDataType) * batch_count * M * N;
+
+        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    return pass ? 0 : 1;
+}
+
+bool run_batched_gemm_rowwise_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    std::mt19937 gen(11939);
+    std::uniform_int_distribution<int> dis(0, 15);
+
+    problem_size.M = 256 * (dis(gen) + 1);
+    problem_size.N = 128 * (dis(gen) + 1);
+    problem_size.K = 128 * (dis(gen) + 2);
+
+    problem_size.batch_count = 2;
+
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 8)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.M           = std::stoi(argv[4]);
+        problem_size.N           = std::stoi(argv[5]);
+        problem_size.K           = std::stoi(argv[6]);
+        problem_size.batch_count = std::stoi(argv[7]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("optinal\n");
+        printf("arg4-7: M = %d N = %d K = %d Batch = %d\n",
+               problem_size.M,
+               problem_size.N,
+               problem_size.K,
+               problem_size.batch_count);
+        exit(0);
+    }
+
+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.stride_D0 = 0;
+    problem_size.stride_D1 = 0;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
+    problem_size.batch_stride_D0 = problem_size.N;
+    problem_size.batch_stride_D1 = problem_size.M;
+
+    return run_batched_gemm_rowwise(problem_size, config);
+}
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
index f18dc3290..58c0288e8 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -53,6 +53,47 @@ struct DeviceBatchedGemmMultiD : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceBatchedGemmV2MultiD : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor");
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        const std::array<const void*, NumDTensor>& p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t Batch,
+                        index_t StrideA,
+                        index_t StrideB,
+                        const std::array<ck::index_t, NumDTensor>& StrideDs,
+                        index_t StrideE,
+                        index_t BatchStrideA,
+                        index_t BatchStrideB,
+                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                        index_t BatchStrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
new file mode 100644
index 000000000..314ecdf76
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -0,0 +1,1014 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+
+// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
+// kernel function Blockers:
+// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on
+// two lds chunks.
+// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
+// buffer when we declare __shared__ inside blkgemmpipe
+template <typename GridwiseGemm,
+          typename BatchedGemmArg,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_batched_gemm_xdl_cshuffle_v3_multi_d(BatchedGemmArg karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    const index_t g_idx = blockIdx.z % karg.Batch;
+
+    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+
+    // populate pointer, desc for Ds
+    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+        // D pointer
+        karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
+    });
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + a_batch_offset,
+        karg.p_b_grid + b_batch_offset,
+        karg.p_ds_grid,
+        karg.p_c_grid + c_batch_offset,
+        p_shared,
+        karg,
+        karg.a_element_op,
+        karg.b_element_op,
+        karg.c_element_op);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename GridwiseGemm,
+          typename BatchedGemmArg,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds(BatchedGemmArg karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    const index_t g_idx = blockIdx.z % karg.Batch;
+
+    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+
+    // populate pointer, desc for Ds
+    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+        // D pointer
+        karg.p_ds_grid(i) = karg.p_ds_grid(i) + ds_batch_offset[i];
+    });
+
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + a_batch_offset,
+        karg.p_b_grid + b_batch_offset,
+        karg.p_ds_grid,
+        karg.p_c_grid + c_batch_offset,
+        p_shared_0,
+        p_shared_1,
+        karg,
+        karg.a_element_op,
+        karg.b_element_op,
+        karg.c_element_op);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = ADataType,
+          typename ComputeTypeB                       = BDataType,
+          typename LDSTypeA                           = ComputeTypeA,
+          typename LDSTypeB                           = ComputeTypeB>
+struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
+    : public DeviceBatchedGemmV2MultiD<ALayout,
+                                       BLayout,
+                                       DsLayout,
+                                       CLayout,
+                                       ADataType,
+                                       BDataType,
+                                       DsDataType,
+                                       CDataType,
+                                       AElementwiseOperation,
+                                       BElementwiseOperation,
+                                       CElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        DsLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        DsDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        LDSTypeA,
+        LDSTypeB>;
+
+    struct ComputePtrOffsetOfStridedBatch
+    {
+        ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
+                                       index_t BatchStrideB,
+                                       std::array<ck::index_t, NumDTensor> BatchStrideDs,
+                                       index_t BatchStrideC)
+            : BatchStrideA_(BatchStrideA),
+              BatchStrideB_(BatchStrideB),
+              BatchStrideDs_(BatchStrideDs),
+              BatchStrideC_(BatchStrideC)
+        {
+        }
+
+        __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
+        {
+            return static_cast<long_index_t>(BatchStrideA_) * g_idx;
+        }
+
+        __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
+        {
+            return static_cast<long_index_t>(BatchStrideB_) * g_idx;
+        }
+
+        __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
+        {
+            std::array<long_index_t, NumDTensor> ds_offset_;
+
+            static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
+                ds_offset_[i] = static_cast<long_index_t>(BatchStrideDs_[i]) * g_idx;
+            });
+
+            return ds_offset_;
+        }
+
+        __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
+        {
+            return static_cast<long_index_t>(BatchStrideC_) * g_idx;
+        }
+
+        private:
+        index_t BatchStrideA_;
+        index_t BatchStrideB_;
+        const std::array<ck::index_t, NumDTensor> BatchStrideDs_;
+        index_t BatchStrideC_;
+    };
+
+    struct Argument : public GridwiseGemm::Argument
+    {
+        index_t Batch;
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
+
+        Argument(const ADataType* p_a_grid_,
+                 const BDataType* p_b_grid_,
+                 std::array<const void*, NumDTensor> p_ds_grid_,
+                 CDataType* p_e_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 std::array<index_t, NumDTensor> StrideDs_,
+                 index_t StrideE_,
+                 index_t BatchStrideA_,
+                 index_t BatchStrideB_,
+                 const std::array<ck::index_t, NumDTensor>& BatchStrideDs_,
+                 index_t BatchStrideE_,
+                 index_t Batch_,
+                 AElementwiseOperation a_element_op_,
+                 BElementwiseOperation b_element_op_,
+                 CElementwiseOperation c_element_op_)
+            : GridwiseGemm::Argument{p_a_grid_,
+                                     p_b_grid_,
+                                     p_ds_grid_,
+                                     p_e_grid_,
+                                     M_,
+                                     N_,
+                                     K_,
+                                     StrideA_,
+                                     StrideB_,
+                                     StrideDs_,
+                                     StrideE_,
+                                     1,
+                                     a_element_op_,
+                                     b_element_op_,
+                                     c_element_op_},
+              Batch{Batch_},
+              compute_ptr_offset_of_batch{
+                  BatchStrideA_, BatchStrideB_, BatchStrideDs_, BatchStrideE_}
+        {
+        }
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1)
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+
+                    std::array<std::size_t, NumDTensor> DsSize;
+
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType) * arg.Batch;
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType) * arg.Batch;
+
+                    const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
+                        arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
+
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                        DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
+                    });
+                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                            GridwiseGemm,
+                            Argument,
+                            true,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                            GridwiseGemm,
+                            Argument,
+                            true,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                    GridwiseGemm,
+                                    Argument,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                                GridwiseGemm,
+                                Argument,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                            GridwiseGemm,
+                            Argument,
+                            false,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_batched_gemm_xdl_cshuffle_v3_multi_d<
+                            GridwiseGemm,
+                            Argument,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t Batch,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             index_t BatchStrideA,
+                             index_t BatchStrideB,
+                             const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                             index_t BatchStrideE,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{static_cast<const ADataType*>(p_a),
+                        static_cast<const BDataType*>(p_b),
+                        p_ds,
+                        static_cast<CDataType*>(p_e),
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideDs,
+                        StrideE,
+                        BatchStrideA,
+                        BatchStrideB,
+                        BatchStrideDs,
+                        BatchStrideE,
+                        Batch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        const std::array<const void*, NumDTensor>& p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t Batch,
+                        index_t StrideA,
+                        index_t StrideB,
+                        const std::array<ck::index_t, NumDTensor>& StrideDs,
+                        index_t StrideE,
+                        index_t BatchStrideA,
+                        index_t BatchStrideB,
+                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                        index_t BatchStrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          p_ds,
+                                          static_cast<CDataType*>(p_e),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideDs,
+                                          StrideE,
+                                          BatchStrideA,
+                                          BatchStrideB,
+                                          BatchStrideDs,
+                                          BatchStrideE,
+                                          Batch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceBatchedGemmXdlUniversal"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp
new file mode 100644
index 000000000..16c4d792d
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_ENABLE_BF16
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+#endif
+
+#ifdef CK_ENABLE_FP8
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+#endif
+
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatchedGemmV2MultiD<
+    ALayout,
+    BLayout,
+    DsLayout,
+    CLayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    CDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceBatchedGemmV2MultiD<ALayout,
+                                               BLayout,
+                                               DsLayout,
+                                               CLayout,
+                                               ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               CDataType,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
+#ifdef CK_ENABLE_FP8
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 80f0fc306..6a1558a52 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -81,6 +81,12 @@ function(add_instance_library INSTANCE_NAME)
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_")
+         message("removing batched_gemm_universal_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
     endif()
     #only continue if there are some source files left on the list
     if(ARGN)
@@ -102,6 +108,9 @@ function(add_instance_library INSTANCE_NAME)
                 if(source MATCHES "gemm_multiply_multiply_f8")
                     list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
+                if(source MATCHES "bached_gemm_multiply_multiply_f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                endif()
             endif()
             set(offload_targets)
             foreach(target IN LISTS INST_TARGETS)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt
new file mode 100644
index 000000000..1affa12bb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt
@@ -0,0 +1,19 @@
+# ONLY XDL_KERNELS
+set(GEMM_UNIVERSAL_BATCHED_INSTANCES)
+
+list(APPEND GEMM_UNIVERSAL_BATCHED_INSTANCES 
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+    )
+
+
+set_source_files_properties(device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+
+add_instance_library(device_gemm_universal_batched_instance ${GEMM_UNIVERSAL_BATCHED_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
new file mode 100644
index 000000000..5db041de0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout| DsLayout| CLayout|    AData|  BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##################################|        |        |         |        |     Type|   Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##################################|        |        |         |        |         |       |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##################################|        |        |         |        |         |       |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched,
+          GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //##################################| ALayout| BLayout| DsLayout| CLayout|   AData|    BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //##################################|        |        |         |        |    Type|     Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //##################################|        |        |         |        |        |         |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //##################################|        |        |         |        |        |         |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   4,   4,  16,   16,    4,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,             S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     BF16,   BF16, DsDataType,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..12aa7c380
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..1dbf5f3d1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..f532309a5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
new file mode 100644
index 000000000..355dc3212
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
+// clang-format off
+        //##################################| ALayout| BLayout| DsLayout| CLayout|AData| BData|       DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#ifdef __gfx94__
+        // Compute friendly
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,  16,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched,
+          GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
+// clang-format off
+        //##################################| ALayout| BLayout| DsLayout| CLayout|AData| BData|       DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    128, 16,  16,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,     64, 16,  16,  16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    128, 16,  16,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..7f19a0112
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..4489a974b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..afbc9afb9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
new file mode 100644
index 000000000..53f81162a
--- /dev/null
+++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
+bool profile_gemm_universal_batched_impl(int do_verification,
+                                         int init_method,
+                                         bool do_log,
+                                         bool time_kernel,
+                                         int M,
+                                         int N,
+                                         int K,
+                                         int BatchStrideA,
+                                         int BatchStrideB,
+                                         int BatchStrideC,
+                                         int StrideA,
+                                         int StrideB,
+                                         int StrideC,
+                                         int BatchCount,
+                                         int n_warmup,
+                                         int n_iter,
+                                         uint64_t rotating = 0)
+{
+    bool pass = true;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+        }
+    };
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{}));
+    Tensor<CDataType> c_g_m_n_host_result(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+    Tensor<CDataType> c_g_m_n_device_result(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+
+    int total_gemm_needed =
+        a_g_m_k.GetElementSpaceSizeInBytes() + b_g_k_n.GetElementSpaceSizeInBytes();
+    int rotating_count = std::max(
+        1,
+        std::min(n_iter,
+                 static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
+    std::cout << "rotating count: " << rotating_count << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    if(do_verification)
+    {
+        using ReferenceBatchedGemmInstance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                             BDataType,
+                                                             CDataType,
+                                                             float,
+                                                             AElementOp,
+                                                             BElementOp,
+                                                             CElementOp>;
+
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+    c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
+        // false branch for multi d dl kernel
+
+        argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                        {},
+                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                        M,
+                                        N,
+                                        K,
+                                        BatchCount,
+                                        StrideA,
+                                        StrideB,
+                                        {},
+                                        StrideC,
+                                        BatchStrideA,
+                                        BatchStrideB,
+                                        {},
+                                        BatchStrideC,
+                                        ck::tensor_operation::element_wise::PassThrough{},
+                                        ck::tensor_operation::element_wise::PassThrough{},
+                                        ck::tensor_operation::element_wise::PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init C to zero before profiling next kernel
+            c_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            float ave_time = invoker_ptr->Run(
+                argument_ptr.get(),
+                StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});
+
+            std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
+
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                     sizeof(CDataType) * M * N) *
+                                    BatchCount;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+
+                pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K
+              << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC
+              << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 7d4df3cf9..f079d554b 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
   list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   endif()
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
diff --git a/profiler/src/profile_gemm_universal_batched.cpp b/profiler/src/profile_gemm_universal_batched.cpp
new file mode 100644
index 000000000..4afef8e55
--- /dev/null
+++ b/profiler/src/profile_gemm_universal_batched.cpp
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_universal_batched_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp"
+
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+enum struct GemmDataType
+{
+    BF16_BF16_BF16, // 0
+    F8_F8_BF16,     // 1
+};
+
+#define OP_NAME "gemm_universal_batched"
+#define OP_DESC "Batched GEMM Universal"
+
+int profile_batched_gemm_universal(int argc, char* argv[])
+{
+    if(argc != 18 && argc != 21)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: bf16, 1: fp8->bf16)\n");
+        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
+        printf("                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
+        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        printf("optional:\n");
+        printf("arg18: number of warm-up cycles (default 1)\n");
+        printf("arg19: number of iterations (default 10)\n");
+        printf("arg20: memory for rotating buffer (default 0, size in MB)\n");
+        // clang-format on
+        exit(1);
+    }
+
+    int n_warmup      = 1;
+    int n_iter        = 10;
+    uint64_t rotating = 0;
+    if(argc == 21)
+    {
+        n_warmup = std::stoi(argv[18]);
+        n_iter   = std::stoi(argv[19]);
+        rotating = std::stoull(argv[20]) * 1024 * 1024;
+    }
+
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+
+    const int BatchStrideA = std::stoi(argv[14]);
+    const int BatchStrideB = std::stoi(argv[15]);
+    const int BatchStrideC = std::stoi(argv[16]);
+
+    const int BatchCount = std::stoi(argv[17]);
+
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    using F8 = ck::f8_t;
+#endif
+    using BF16 = ck::bhalf_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType  = decltype(a_type);
+            using BDataType  = decltype(b_type);
+            using DsDataType = ck::Tuple<>;
+            using CDataType  = decltype(c_type);
+
+            using ALayout  = decltype(a_layout);
+            using BLayout  = decltype(b_layout);
+            using DsLayout = ck::Tuple<>;
+            using CLayout  = decltype(c_layout);
+
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmV2MultiD<ALayout,
+                                                                                     BLayout,
+                                                                                     DsLayout,
+                                                                                     CLayout,
+                                                                                     ADataType,
+                                                                                     BDataType,
+                                                                                     DsDataType,
+                                                                                     CDataType,
+                                                                                     AElementOp,
+                                                                                     BElementOp,
+                                                                                     CElementOp>;
+
+            bool pass = ck::profiler::profile_gemm_universal_batched_impl<ADataType,
+                                                                          BDataType,
+                                                                          CDataType,
+                                                                          ALayout,
+                                                                          BLayout,
+                                                                          CLayout,
+                                                                          AElementOp,
+                                                                          BElementOp,
+                                                                          CElementOp,
+                                                                          DeviceOp>(do_verification,
+                                                                                    init_method,
+                                                                                    do_log,
+                                                                                    time_kernel,
+                                                                                    M,
+                                                                                    N,
+                                                                                    K,
+                                                                                    BatchStrideA_,
+                                                                                    BatchStrideB_,
+                                                                                    BatchStrideC_,
+                                                                                    StrideA_,
+                                                                                    StrideB_,
+                                                                                    StrideC_,
+                                                                                    BatchCount,
+                                                                                    n_warmup,
+                                                                                    n_iter,
+                                                                                    rotating);
+
+            return pass ? 0 : 1;
+        };
+
+    if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(BF16{}, BF16{}, BF16{}, Row{}, Col{}, Row{});
+    }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F8{}, BF16{}, Row{}, Col{}, Row{});
+    }
+#endif
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_universal);
-- 
GitLab


From 8aba2724cc9a3bc9ddaa7e26055169e014f8dab7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 18 Nov 2024 14:07:04 -0800
Subject: [PATCH 062/153] Add bf16 and int8 wmma gemms for Navi3x and Navi4x.
 (#1671)

* add bf16 gemms for gfx11/gfx12

* reduce the input values in test_gemm

* add int8 wmma gemm instances for gfx11/gfx12

* add example gemm_wmma_int8

* fix bug in gemm_wmma_int8 test

* increase bf16 gemm test tolerance

* update the dates and clean-up commented-out instances
---
 example/01_gemm/CMakeLists.txt                |  4 +
 example/01_gemm/gemm_wmma_bf16.cpp            | 84 +++++++++++++++++++
 example/01_gemm/gemm_wmma_int8.cpp            | 84 +++++++++++++++++++
 include/ck/utility/amd_wmma.hpp               | 11 +--
 .../tensor_operation_instance/gpu/gemm.hpp    | 52 ++++++++++++
 .../gpu/gemm_wmma.inc                         | 40 +++++++++
 .../include/ck/library/utility/check_err.hpp  |  2 +-
 .../gpu/gemm/CMakeLists.txt                   | 33 +++-----
 ..._wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp | 77 +++++++++++++++++
 ..._wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp | 77 +++++++++++++++++
 ..._wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 77 +++++++++++++++++
 ..._wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 77 +++++++++++++++++
 ..._wmma_int8_int8_int8_km_kn_mn_instance.cpp | 76 +++++++++++++++++
 ..._wmma_int8_int8_int8_km_nk_mn_instance.cpp | 76 +++++++++++++++++
 ..._wmma_int8_int8_int8_mk_kn_mn_instance.cpp | 76 +++++++++++++++++
 ..._wmma_int8_int8_int8_mk_nk_mn_instance.cpp | 76 +++++++++++++++++
 16 files changed, 896 insertions(+), 26 deletions(-)
 create mode 100644 example/01_gemm/gemm_wmma_bf16.cpp
 create mode 100644 example/01_gemm/gemm_wmma_int8.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp

diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 98fd9c6b7..52c8ab580 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -83,3 +83,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
 add_custom_target(example_gemm_wmma)
 add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
 add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
+add_example_executable(example_gemm_wmma_bf16 gemm_wmma_bf16.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16)
+add_example_executable(example_gemm_wmma_int8 gemm_wmma_int8.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_int8)
diff --git a/example/01_gemm/gemm_wmma_bf16.cpp b/example/01_gemm/gemm_wmma_bf16.cpp
new file mode 100644
index 000000000..a87426094
--- /dev/null
+++ b/example/01_gemm/gemm_wmma_bf16.cpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using CDataType        = ck::bhalf_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+           4,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+           S<4, 32, 1>,
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>,
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>,
+           8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
diff --git a/example/01_gemm/gemm_wmma_int8.cpp b/example/01_gemm/gemm_wmma_int8.cpp
new file mode 100644
index 000000000..a88e42d42
--- /dev/null
+++ b/example/01_gemm/gemm_wmma_int8.cpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+
+using ADataType        = int8_t;
+using BDataType        = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using CDataType        = int8_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
+         < ALayout,
+           BLayout,
+           CLayout,
+           ADataType,
+           BDataType,
+           CDataType,
+           AccDataType,
+           CShuffleDataType,
+           AElementOp,
+           BElementOp,
+           CElementOp,
+           GemmDefault,
+           1,           // Prefetch stage
+           128,         // BlockSize
+           64,          // MPerBlock
+           128,         // NPerBlock
+           64,          // KPerBlock
+           2,           // K1
+           16,          // MPerWmma
+           16,          // NPerWmma
+           2,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+           4,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+           S<4, 32, 1>,
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           S<4, 32, 1>,
+           S<1, 0, 2>,
+           S<1, 0, 2>,
+           2,
+           2,
+           2,
+           true,
+           1,           // C shuffle (M Repeat) Per store
+           1,           // C shuffle (N Repeat) Per store
+           S<1, 32, 1,  4>,
+           8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index d04513f3e..aa519fb2b 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -13,6 +13,11 @@ namespace ck {
     defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
+
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
+#define __gfx12__
+#endif
+
 /********************************WAVE32 MODE***********************************************/
 
 // src: fp16, dst: fp32
@@ -99,7 +104,7 @@ struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, Opsel>
         // opsel usage
         // false: D0.[0:15] = result
         // true : D0.[16:31]= result
-#if defined(__gfx11__)
+#if defined(__gfx11__) || defined(__gfx12__)
         reg_c.template AsType<bhalf16_t>()(Number<0>{}) =
             __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(
                 reg_a, reg_b, reg_c.template AsType<bhalf16_t>()[Number<0>{}], Opsel);
@@ -261,10 +266,6 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
 // gfx12
 /********************************WAVE32 MODE***********************************************/
 
-#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
-#define __gfx12__
-#endif
-
 // src: fp16, dst: fp32
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_wmma_f32_16x16x16_f16_w32_gfx12;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 50c18fc22..3b3baf697 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -180,6 +180,58 @@ struct DeviceOperationInstanceFactory<
             }
         }
 #endif
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<CDataType, ck::bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif
+#ifdef CK_ENABLE_INT8
+        if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<CDataType, int8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif
 #endif
 
 #ifdef CK_USE_XDL
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc
index c97298c25..c50226335 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_wmma.inc
@@ -28,6 +28,46 @@ void add_device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances(
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index 88741c3b9..08bfefb87 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -206,7 +206,7 @@ typename std::enable_if<
 check_err(const Range& out,
           const RefRange& ref,
           const std::string& msg = "Error: Incorrect results!",
-          double rtol            = 1e-3,
+          double rtol            = 1e-1,
           double atol            = 1e-3)
 {
     if(out.size() != ref.size())
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index e4efae617..b8ecb4557 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -2,9 +2,7 @@ set(GEMM_INSTANCES)
 list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
     device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
     device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
-    device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp)
-
-list(APPEND GEMM_INSTANCES
+    device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
     device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
     device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
     device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -21,9 +19,6 @@ list(APPEND GEMM_INSTANCES
     device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
     device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
     device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
-    )
-
-list(APPEND GEMM_INSTANCES
     device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
     device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
     device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -78,9 +73,6 @@ list(APPEND GEMM_INSTANCES
     device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp
     device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp
     device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
-    )
-
-list(APPEND GEMM_INSTANCES
     device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
     device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
     device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -92,15 +84,11 @@ list(APPEND GEMM_INSTANCES
     device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
     device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
-    device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp)
-
-list(APPEND GEMM_INSTANCES
+    device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
-    device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
-
-list(APPEND GEMM_INSTANCES
+    device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_default_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_default_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_default_instance.cpp
@@ -109,14 +97,19 @@ list(APPEND GEMM_INSTANCES
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_padded_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
-    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
-
-
-list(APPEND GEMM_INSTANCES
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp
     device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp
     device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp
     device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp
-    device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp)
+    device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp
+    device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp
+    device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp
+    device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp
+    device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp
+    device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp
+    device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp
+    device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp
+    device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp)
 
 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp
new file mode 100644
index 000000000..7a952c44d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_bf16_bf16_bf16_km_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp
new file mode 100644
index 000000000..f0dbee5f5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_bf16_bf16_bf16_km_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp
new file mode 100644
index 000000000..3db41222a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_bf16_bf16_bf16_mk_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp
new file mode 100644
index 000000000..ee25b8f6d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instance.cpp
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |               |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_bf16_bf16_bf16_mk_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp
new file mode 100644
index 000000000..dc763afa0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_kn_mn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8  = int8_t;
+using I32 = int32_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_wmma_int8_int8_int8_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_int8_int8_int8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, I8, I8, I8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_km_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp
new file mode 100644
index 000000000..ec4541ed7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_km_nk_mn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8  = int8_t;
+using I32 = int32_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_wmma_int8_int8_int8_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Col,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_int8_int8_int8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, I8, I8, I8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_km_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp
new file mode 100644
index 000000000..a2166bdbc
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_kn_mn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8  = int8_t;
+using I32 = int32_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|          GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|              |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |              |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Row,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, I8, I8, I8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_mk_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp
new file mode 100644
index 000000000..187a9c772
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_int8_int8_int8_mk_nk_mn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I8  = int8_t;
+using I32 = int32_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumPrefetch| Block|  MPer|  NPer|  KPer| K1| MPer| NPer|      M|       N|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|
+        //######################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|            |  Size| Block| Block| Block|   | WMMA| WMMA| Repeat|  Repeat|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|  MRepeat|  MRepeat|        ClusterLengths|       ScalarPerVector|
+        //######################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |            |      |      |      |      |   |     |     |       |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerStore| PerStore|      MBlock_MPerBlock|                      |
+        //######################|        |        |        |      |      |      |        |         |            |            |            |               |            |      |      |      |      |   |     |     |       |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |         |         |      NBlock_NPerBlock|                      |
+        /* Prefetch 2, consume enormous vgpr resource*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   256,   128,   128,    32,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,   128,   128,    64,    64,  8,   16,   16,      4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    64,    64,    32,    32,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           2,    32,    16,    16,    32,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/
+        // 8 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   256,    64,  8,   16,   16,      4,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   128,    64,  8,   16,   16,      4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  8>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   256,   128,   160,    64,  8,   16,   16,      2,       5,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  4>,                      8>,
+        // 4 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   128,   128,    32,  8,   16,   16,      4,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,   256,    64,    64,  8,   16,   16,      8,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,   256,    64,  8,   16,   16,      2,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 32, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,   128,    64,    80,    64,  8,   16,   16,      1,       5,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 64, 1,  2>,                      8>,
+        // 2 Waves
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    16,    64,    64,  8,   16,   16,      1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    64,    32,    64,  8,   16,   16,      4,       1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    64,    32,    64,    64,  8,   16,   16,      2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  4>,                      8>,
+        // 1 Wave
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    32,    64,  8,   16,   16,      1,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>,
+        DeviceGemmWmma_CShuffle<      Row,     Col,     Row,    I8,    I8,    I8,     I32,       I8, PassThrough, PassThrough, PassThrough, GemmMNKPadding,           1,    32,    16,    16,    64,  8,   16,   16,      1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,        1,        1,       S<1, 16, 1,  2>,                      8>
+    // clang-format on
+    >;
+
+void add_device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, I8, I8, I8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances, device_gemm_wmma_int8_int8_int8_mk_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
-- 
GitLab


From e4dfe4d892bfba901204b4975a478d4cce21e5a5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 18 Nov 2024 22:00:18 -0800
Subject: [PATCH 063/153] Bump rocm-docs-core from 1.8.4 to 1.8.5 in
 /docs/sphinx (#1674)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.4 to 1.8.5.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.8.5/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.4...v1.8.5)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 9824df626..3a2e266ef 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.8.4
+rocm-docs-core==1.8.5
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index f89fbcf27..b65d2391f 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.8.4
+rocm-docs-core==1.8.5
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From da0c21f6610e4fa98cf7719e3f92410ffafc963f Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:00:17 -0800
Subject: [PATCH 064/153] add more fp32 dl gemm instances (#1675)

* add more fp32 dl gemm instances

* update the dates
---
 ..._gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp | 41 ++++++++++++++++---
 ..._gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp | 41 ++++++++++++++++---
 ..._gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp | 41 ++++++++++++++++---
 ..._gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp | 41 ++++++++++++++++---
 4 files changed, 140 insertions(+), 24 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
index e696bfdcd..038234111 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -14,15 +14,12 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
 using F32 = float;
-
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
+using S           = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -34,7 +31,39 @@ using device_gemm_dl_f32_f32_f32_km_kn_mn_instances = std::tuple<
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                    |                    |                 |                   |
-        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 1>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<2, 1, 4, 1>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,       S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<16, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,     S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
index d3ad7c60e..f61ae84ba 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -14,15 +14,12 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
 using F32 = float;
-
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
+using S           = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_km_nk_mn_instances =
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 1>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,        S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
index a56a36b0a..2aeaed1fe 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -14,15 +14,12 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
 using F32 = float;
-
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
+using S           = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_mk_kn_mn_instances =
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,        S<1, 1, 1, 1>,      S<2, 1, 4, 1>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 8, 2>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
index 63d55e81d..ff3394d83 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -14,15 +14,12 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
 using F32 = float;
-
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
+using S           = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -35,7 +32,39 @@ using device_gemm_dl_f32_f32_f32_mk_nk_mn_instances =
         //  ########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
         //  ########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
         //  ########|      |      |      |        |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
-        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,        S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,        S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,                5,                  4>
+        // MPerBlock=128, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=128, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,    64,    16,  2,          4,          4,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=64, NPerBlock=128
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<4, 2>,       S<8, 2>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    64,   128,    16,  2,          4,          4,      1,       S<2, 4>,       S<2, 8>,      S<8, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 2, 2>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  2,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 2, 2>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<1, 4>,       S<1, 4>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceGemmDl<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
         // clang-format on
         >;
 
-- 
GitLab


From 81ec5eff4a3cb64c6681043593862016193797d1 Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Wed, 20 Nov 2024 23:03:56 +0800
Subject: [PATCH 065/153] fix bug (#1680)

---
 .../device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp | 4 ++--
 .../device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp         | 4 ++--
 .../device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp         | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
index b1b64ca85..9555dffd2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
@@ -41,7 +41,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances = std
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -69,7 +69,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std:
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|       Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |                |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
index 658714d35..8666cf858 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#ifdef __gfx94__
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         //Only enable these instances on gfx94x
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -67,7 +67,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple<
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index 382ed5b5a..f5e801c16 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -40,7 +40,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -68,7 +68,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
-- 
GitLab


From d31e8249c1be17aaada2a8e29df1c6495dc709f4 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 20 Nov 2024 14:01:04 -0800
Subject: [PATCH 066/153] Optimize docker file. (#1679)

* reduce the docker image size and layers

* clean up docker file

* fix linker error for client example 24

* install CK into the default /opt/rocm/ path

* restore installing CK to alternative path in CI

* add linking for utility lib
---
 Dockerfile                                    | 91 +++++++------------
 .../24_grouped_conv_activation/CMakeLists.txt |  4 +-
 client_example/CMakeLists.txt                 |  2 +-
 3 files changed, 35 insertions(+), 62 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 791d1d9f3..b06726335 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,18 +4,14 @@ ARG ROCMVERSION=6.2
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
-
-RUN set -xe
-
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
-RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
-# Add rocm repository
-RUN chmod 1777 /tmp
-RUN apt-get update
-RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
-
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
-RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+
+# Add rocm repository
+RUN set -xe && \
+    useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
+    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
 RUN if [ "$ROCMVERSION" != "6.3" ]; then \
         sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb  --no-check-certificate" && \
@@ -30,8 +26,8 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \
         amdgpu-repo --amdgpu-build=2074281; \
     fi
 
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
-RUN amdgpu-install -y --usecase=rocm --no-dkms
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
+    amdgpu-install -y --usecase=rocm --no-dkms
 
 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -76,66 +72,49 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     clang-format-12 \
     kmod && \
     apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf amdgpu-install* && \
+# Remove unnecessary rocm components that take a lot of space
+    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev
 
 # hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
 RUN if [ "$ROCMVERSION" = "6.1" ]; then \
         sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
     fi
 # Update the cmake to version 3.27.5
-RUN pip install --upgrade cmake==3.27.5
-
+RUN pip install --upgrade cmake==3.27.5 && \
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
-    cd ccache && mkdir build && cd build && cmake .. && make install
-
+    git clone https://github.com/ccache/ccache.git && \
+    cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
-RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
-RUN gunzip /usr/local/bin/ninja.gz
-RUN chmod a+x /usr/local/bin/ninja
-RUN git clone https://github.com/nico/ninjatracing.git
-
+    wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
+    gunzip /usr/local/bin/ninja.gz && \
+    chmod a+x /usr/local/bin/ninja && \
+    git clone https://github.com/nico/ninjatracing.git && \
 #Install latest cppcheck
-RUN git clone https://github.com/danmar/cppcheck.git && \
+    git clone https://github.com/danmar/cppcheck.git && \
     cd cppcheck && mkdir build && cd build && cmake .. && cmake --build .
 WORKDIR /
 
-# Setup ubsan environment to printstacktrace
-RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
-ENV UBSAN_OPTIONS=print_stacktrace=1
-
 # Install an init system
-RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
-RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
-
-ARG PREFIX=/opt/rocm
+RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
+    dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
-RUN pip3 install --upgrade pip
-RUN pip3 install sqlalchemy==1.4.46
-RUN pip3 install pymysql
-RUN pip3 install pandas==2.0.3
-RUN pip3 install setuptools-rust
-RUN pip3 install sshtunnel==0.4.0
-# Setup ubsan environment to printstacktrace
-ENV UBSAN_OPTIONS=print_stacktrace=1
-
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
-RUN groupadd -f render
-
+    pip3 install --upgrade pip && \
+    pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \
+# Add render group
+    groupadd -f render && \
 # Install the new rocm-cmake version
-RUN git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
-  cd rocm-cmake && mkdir build && cd build && \
-  cmake  .. && cmake --build . && cmake --build . --target install
+    git clone -b master https://github.com/ROCm/rocm-cmake.git  && \
+    cd rocm-cmake && mkdir build && cd build && \
+    cmake  .. && cmake --build . && cmake --build . --target install
 
 WORKDIR /
-
+# Add alternative compilers, if necessary
 ENV compiler_version=$compiler_version
 ENV compiler_commit=$compiler_commit
-RUN sh -c "echo compiler version = '$compiler_version'"
-RUN sh -c "echo compiler commit = '$compiler_commit'"
-
-ARG DISABLE_CACHE=0
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+    sh -c "echo compiler commit = '$compiler_commit'"
 
 RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
@@ -152,9 +131,3 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
         make -j 8 ; \
     else echo "using the release compiler"; \
     fi
-
-#clean-up the deb package
-RUN sh -c "rm -rf amdgpu-install*"
-
-#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
-#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
diff --git a/client_example/24_grouped_conv_activation/CMakeLists.txt b/client_example/24_grouped_conv_activation/CMakeLists.txt
index dc55250bf..67bbdfec4 100644
--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
@@ -54,7 +54,7 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
                       PRIVATE composable_kernel::device_conv_operations
                               composable_kernel::device_other_operations
                               composable_kernel::device_reduction_operations
-                              utility)
+                              composable_kernel::utility)
 # Fwd convscale + AMAX
 add_executable(client_conv3d_fwd_convscale_amax_fp8
                grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
@@ -62,7 +62,7 @@ target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
                       PRIVATE composable_kernel::device_conv_operations
                               composable_kernel::device_other_operations
                               composable_kernel::device_reduction_operations
-                              utility)
+                              composable_kernel::utility)
 # Fwd convscale
 add_executable(client_conv3d_fwd_convscale_fp8
                grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)
diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt
index acb57d7bb..c393972b4 100644
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -62,7 +62,7 @@ else()
     set(CK_USE_WMMA "ON")
 endif()
 
-find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations  device_reduction_operations)
+find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations  device_reduction_operations utility)
 if(GPU_TARGETS MATCHES "gfx9")
     find_package(composable_kernel COMPONENTS device_contraction_operations)
 endif()
-- 
GitLab


From 6916d8cc033543d1ea2028215d75409e11813dd9 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Thu, 21 Nov 2024 14:49:13 +0800
Subject: [PATCH 067/153] Add QianFeng to code owners (#1682)

---
 .github/CODEOWNERS | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 459315e58..5340be274 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
+library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
-- 
GitLab


From fb1ccfa9df534c8c9f351dd959a0ff692d6f9210 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Thu, 21 Nov 2024 14:53:10 +0800
Subject: [PATCH 068/153] [CK_TILE] Add paged-kvcache support in group mode
 fmha fwd splitkv kernels (#1678)

* Generate group mode paged-attn kernel

* Enable paged-kvcache + group mode support

* Add missing header: fused_moe.hpp

* Add comment to explain kernel arg usage

* Make error message more clear

* Add comment for confusing data member names

* Add more comment for confusing variable names

* Fix typo in option description
---
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |  3 -
 example/ck_tile/01_fmha/fmha_fwd.cpp          | 59 ++++++++++++-------
 example/ck_tile/01_fmha/fmha_fwd.hpp          | 10 +++-
 example/ck_tile/01_fmha/utils.hpp             |  4 +-
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   | 49 ++++++++++-----
 include/ck_tile/ops/fused_moe.hpp             | 11 ++++
 6 files changed, 94 insertions(+), 42 deletions(-)
 create mode 100644 include/ck_tile/ops/fused_moe.hpp

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index b084e9d0f..d1da95156 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -655,9 +655,6 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                         # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                         continue
-                    if pipeline.F_pagedkv == 't':
-                        # we only use batch mode kernels to handle (paged-) kvcache problems
-                        continue
                 k = Kernel(F_idx=0,
                            F_hdim=hdim,
                            F_dtype=dtype,
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 14291715f..00e0a1653 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -62,7 +62,7 @@ auto create_args(int argc, char* argv[])
                 "-1 to choose s_knew in [1, s] randomly.")
         .insert("s_kpad",
                 "-1",
-                "seqlen_k stride between 2 tokens, currently used in group-mode only\n"
+                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
                 "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n"
                 "along seqlen, instead of packed. same as xformer kv_padding")
         .insert("d", "128", "head dim for q, k")
@@ -294,7 +294,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
 #if !CK_TILE_FMHA_FWD_APPENDKV_API
     if(seqlen_knew != 0)
     {
-        std::cerr << "kvcache is not supported. ignoring the 's_knew' option" << std::endl;
+        std::cerr << "fmha_fwd_appendkv() is not enabled. ignoring the 's_knew' option"
+                  << std::endl;
         seqlen_knew = 0;
     }
 #endif
@@ -321,6 +322,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
         rotary_dim = 0;
     }
 #endif
+    // to use fmha_fwd_appendkv(), make sure it's in batch mode
+    const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim);
+    if(need_append_kvcache && mode == mode_enum::group)
+    {
+        std::cerr << "fmha_fwd_appendkv() will be invoked. ignoring the 'mode' option" << std::endl;
+        mode = mode_enum::batch;
+    }
     if(!(rotary_dim <= hdim_q))
     {
         std::cerr << "rotary_dim should be less than or equal to head dim for q" << std::endl;
@@ -356,22 +364,26 @@ bool run(const ck_tile::ArgParser& arg_parser)
                   << std::endl;
         use_cache_batch_idx = false;
     }
-#endif
-    if(0 < page_block_size && use_cache_batch_idx)
+#else
+    if(use_cache_batch_idx)
     {
-        std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the "
-                     "'cache_batch_idx' option"
-                  << std::endl;
-        use_cache_batch_idx = false;
+        if(0 < page_block_size)
+        {
+            std::cerr << "paged-kvcache does not support cache_batch_idx. ignoring the "
+                         "'cache_batch_idx' option"
+                      << std::endl;
+            use_cache_batch_idx = false;
+        }
+        else if(mode == mode_enum::group)
+        {
+            std::cerr << "group mode will not use cache_batch_idx. ignoring the "
+                         "'cache_batch_idx' option"
+                      << std::endl;
+            use_cache_batch_idx = false;
+        }
     }
-    // the input tensor layout for kvcache is same as batch mode
-    const bool need_append_kvcache = (0 < seqlen_knew || 0 < rotary_dim);
+#endif
     const bool use_kvcache = (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size);
-    if(use_kvcache && mode != mode_enum::batch)
-    {
-        std::cerr << "kvcache enabled. ignoring the 'mode' option" << std::endl;
-        mode = mode_enum::batch;
-    }
 
     auto [seqlen_qs, seqlen_ks, seqlen_kpads] =
         decode_seqlen(mode,
@@ -380,7 +392,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                       arg_parser.get_str("s_k"),
                       arg_parser.get_str("s_kpad"),
                       /*seqlen_k_min=*/0 < seqlen_knew ? seqlen_knew : 0,
-                      use_kvcache);
+                      need_append_kvcache);
     // compute kvcache seqlen_k (before appending knew/vnew)
     auto cache_seqlen_ks = seqlen_ks;
     std::transform(cache_seqlen_ks.begin(),
@@ -741,8 +753,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
     ck_tile::DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
-    ck_tile::DeviceMem seqlen_k_buf(
-        use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.size() * sizeof(int32_t) : 0);
+    ck_tile::DeviceMem seqlen_k_buf((mode == mode_enum::batch && use_kvcache) ||
+                                            0 <= seqlen_kpads[0]
+                                        ? seqlen_ks.size() * sizeof(int32_t)
+                                        : 0);
     ck_tile::DeviceMem cache_seqlen_k_buf(
         need_append_kvcache ? cache_seqlen_ks.size() * sizeof(int32_t) : 0);
     ck_tile::DeviceMem rotary_cos_buf(rotary_cos_host.get_element_space_size_in_bytes());
@@ -763,7 +777,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
     seqstart_q.ToDevice(seqstart_q_host.data());
     seqstart_k.ToDevice(seqlen_kpads[0] < 0 ? seqstart_k_host.data()
                                             : seqstart_k_with_padding_host.data());
-    seqlen_k_buf.ToDevice(use_kvcache || 0 <= seqlen_kpads[0] ? seqlen_ks.data() : nullptr);
+    seqlen_k_buf.ToDevice((mode == mode_enum::batch && use_kvcache) || 0 <= seqlen_kpads[0]
+                              ? seqlen_ks.data()
+                              : nullptr);
     cache_seqlen_k_buf.ToDevice(need_append_kvcache ? cache_seqlen_ks.data() : nullptr);
     rotary_cos_buf.ToDevice(rotary_cos_host.data());
     rotary_sin_buf.ToDevice(rotary_sin_host.data());
@@ -976,8 +992,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 (mode == mode_enum::group ? seqstart_q.GetDeviceBuffer() : nullptr);
             args.seqstart_k_ptr =
                 (mode == mode_enum::group ? seqstart_k.GetDeviceBuffer() : nullptr);
-            args.seqlen_k_ptr =
-                (use_kvcache || 0 <= k_paddings_[0] ? seqlen_k_buf.GetDeviceBuffer() : nullptr);
+            args.seqlen_k_ptr = ((mode == mode_enum::batch && use_kvcache) || 0 <= k_paddings_[0]
+                                     ? seqlen_k_buf.GetDeviceBuffer()
+                                     : nullptr);
 
             args.seqlen_k     = shape_seqlen_k; // unused in group mode (or kvcache enabled)
             args.max_seqlen_q = max_seqlen_q;
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 251e61bc7..41edac67b 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -173,8 +173,11 @@ struct fmha_fwd_splitkv_args
     //             seqlen_k = kargs.seqlen_k
     // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
     //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
-    // kvcache mode (use same kernel as batch mode):
+    // batch mode (kvcache):
     //             seqlen_q = kargs.seqlen_q
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    // group mode (kvcache):
+    //             seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
     //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
     const void* seqstart_q_ptr;
     const void* seqstart_k_ptr;
@@ -251,7 +254,7 @@ struct fmha_fwd_appendkv_args
     ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
     ck_tile::index_t page_block_size;          // only used if 'block_table_ptr' is not nullptr
 
-    const void* cache_batch_idx;
+    const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache)
 
     ck_tile::index_t stride_q;
     ck_tile::index_t stride_k;
@@ -389,6 +392,9 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                      args.nhead_q,
                                      args.nhead_q / args.nhead_k,
                                      args.num_splits,
+                                     args.block_table_ptr,
+                                     args.batch_stride_block_table,
+                                     args.page_block_size,
                                      args.scale_s,
                                      args.scale_p,
                                      args.stride_q,
diff --git a/example/ck_tile/01_fmha/utils.hpp b/example/ck_tile/01_fmha/utils.hpp
index 996032a71..faf3f0843 100644
--- a/example/ck_tile/01_fmha/utils.hpp
+++ b/example/ck_tile/01_fmha/utils.hpp
@@ -145,7 +145,7 @@ decode_seqlen(mode_enum mode,
               std::string k_val,
               std::string k_pad_val,
               ck_tile::index_t seqlen_k_min = 0,
-              bool use_kvcache              = false,
+              bool need_append_kvcache      = false,
               std::optional<unsigned> seed  = std::nullopt)
 {
 #define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
@@ -159,7 +159,7 @@ decode_seqlen(mode_enum mode,
             const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k);
             std::vector<ck_tile::index_t> seqlen_ks(batch, seqlen_k_max);
 
-            if(1 < batch && use_kvcache)
+            if(1 < batch && need_append_kvcache)
             {
                 // to keep the original s_k value, we always use seqlen_k_max in first batch
                 randints(std::next(seqlen_ks.begin()),
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 4ffebc3c9..98a4329d7 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -46,8 +46,7 @@ struct FmhaFwdSplitKVKernel
     static constexpr auto BiasEnum          = FmhaPipeline::BiasEnum;
     static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
     static constexpr bool kIsPagedKV        = FmhaPipeline::Problem::kIsPagedKV;
-    static_assert(!kIsGroupMode || (kIsGroupMode && !kIsPagedKV),
-                  "paged-kvcache only supported by batch mode kernels");
+
     using FmhaMask                 = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
     static constexpr bool kHasMask = FmhaMask::IsMasking;
 
@@ -198,8 +197,10 @@ struct FmhaFwdSplitKVKernel
         const int32_t* seqlen_k_ptr;
 
         ck_tile::index_t batch_stride_q;
-        ck_tile::index_t batch_stride_k;
-        ck_tile::index_t batch_stride_v;
+        ck_tile::index_t batch_stride_k; // when using paged-kvcache, this will be stride/size for
+                                         // single kcache page-block
+        ck_tile::index_t batch_stride_v; // when using paged-kvcache, this will be stride/size for
+                                         // single vcache page-block
         ck_tile::index_t batch_stride_lse_acc;
         ck_tile::index_t batch_stride_o_acc;
     };
@@ -212,14 +213,17 @@ struct FmhaFwdSplitKVKernel
                                                 AlibiKargs,
                                                 EmptyKargs<0>>>,
           std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>,
-          std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>
+          std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>,
+          std::conditional_t<kIsPagedKV, PageBlockTableKargs, EmptyKargs<3>>
     {
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
 
-        ck_tile::index_t batch_stride_k; // only used for paged-kvcache
-        ck_tile::index_t batch_stride_v; // only used for paged-kvcache
+        ck_tile::index_t batch_stride_k; // only used for paged-kvcache, this will be stride/size
+                                         // for single kcache page-block
+        ck_tile::index_t batch_stride_v; // only used for paged-kvcache, this will be stride/size
+                                         // for single vcache page-block
     };
 
     using Kargs = std::conditional_t<kIsGroupMode, GroupModeKargs, BatchModeKargs>;
@@ -363,6 +367,9 @@ struct FmhaFwdSplitKVKernel
               ck_tile::index_t num_head_q,
               ck_tile::index_t nhead_ratio_qk,
               ck_tile::index_t num_splits,
+              const void* block_table_ptr,
+              ck_tile::index_t batch_stride_block_table,
+              ck_tile::index_t page_block_size,
               float scale_s,
               float scale_p,
               ck_tile::index_t stride_q,
@@ -416,6 +423,7 @@ struct FmhaFwdSplitKVKernel
                     {},                   // placeholder for bias
                     {},                   // placeholder for mask
                     {},                   // placeholder for fp8_static_quant args
+                    {},                   // placeholder for paged-block table
                     reinterpret_cast<const int32_t*>(seqstart_q_ptr),
                     reinterpret_cast<const int32_t*>(seqstart_k_ptr),
                     reinterpret_cast<const int32_t*>(seqlen_k_ptr),
@@ -443,6 +451,12 @@ struct FmhaFwdSplitKVKernel
         {
             kargs.scale_p = scale_p;
         }
+        if constexpr(kIsPagedKV)
+        {
+            kargs.block_table_ptr          = reinterpret_cast<const int32_t*>(block_table_ptr);
+            kargs.batch_stride_block_table = batch_stride_block_table;
+            kargs.page_block_size          = page_block_size;
+        }
 
         return kargs;
     }
@@ -489,15 +503,22 @@ struct FmhaFwdSplitKVKernel
             const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
 
             batch_offset_q = query_start * kargs.stride_q;
-            batch_offset_k = key_start * kargs.stride_k;
-
-            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(kIsPagedKV)
             {
-                batch_offset_v = key_start * kargs.stride_v;
+                batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
+                batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
             }
             else
             {
-                batch_offset_v = key_start;
+                batch_offset_k = key_start * kargs.stride_k;
+                if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+                {
+                    batch_offset_v = key_start * kargs.stride_v;
+                }
+                else
+                {
+                    batch_offset_v = key_start;
+                }
             }
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
             {
@@ -685,7 +706,7 @@ struct FmhaFwdSplitKVKernel
 
                 return make_page_block_navigator<const KDataType, 0>(
                     kargs.k_ptr,
-                    kargs.batch_stride_k,
+                    kargs.batch_stride_k, // kcache page-block stride/size
                     fixed_offset,
                     block_indices,
                     num_blocks,
@@ -715,7 +736,7 @@ struct FmhaFwdSplitKVKernel
 
                 return make_page_block_navigator<const VDataType, 1>(
                     kargs.v_ptr,
-                    kargs.batch_stride_v,
+                    kargs.batch_stride_v, // vcache page-block stride/size
                     fixed_offset,
                     block_indices,
                     num_blocks,
diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp
new file mode 100644
index 000000000..b74607f06
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe.hpp
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
-- 
GitLab


From d6d4c2788bc66c7ead56f1d7b03b7c7b28c2b007 Mon Sep 17 00:00:00 2001
From: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com>
Date: Thu, 21 Nov 2024 08:21:37 -0800
Subject: [PATCH 069/153] universal streamk fp8 changes (#1665)

* universal streamk fp8 changes & ckprofiler instances

* revert strides to -1 and verification options

* fp8 exclusion on pre-gfx94 for universal_streamk

* PR review based revisions: permissions reverted,  removed hip err checks


---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 README.md                                     |   3 +-
 example/01_gemm/CMakeLists.txt                |   3 +
 example/01_gemm/common.hpp                    |   2 +-
 example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp  |  13 +-
 example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp   |  58 ++
 .../01_gemm/run_gemm_example_streamk_v2.inc   |  40 +
 .../device_gemm_xdl_cshuffle_streamk_v3.hpp   | 382 ++++++--
 .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 818 ++++++++++++++++--
 .../gpu/gemm_universal_streamk.hpp            | 315 +++++++
 .../gpu/CMakeLists.txt                        |   6 +
 .../gpu/gemm_universal_streamk/CMakeLists.txt |  45 +-
 ..._universal_streamk_f16_f8_f16_mk_kn_mn.hpp |  84 ++
 ..._f8_f16_mk_kn_mn_comp_default_instance.cpp |  24 +
 ...f8_f16_mk_kn_mn_comp_kpadding_instance.cpp |  24 +
 ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp |  24 +
 ...8_f16_mk_kn_mn_comp_mnpadding_instance.cpp |  24 +
 ...8_f16_mk_kn_mn_mem_v1_default_instance.cpp |  25 +
 ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp |  25 +
 ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp |  25 +
 ...8_f16_mk_kn_mn_mem_v2_default_instance.cpp |  25 +
 ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp |  25 +
 ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp |  25 +
 ..._universal_streamk_f16_f8_f16_mk_nk_mn.hpp |  90 ++
 ..._f8_f16_mk_nk_mn_comp_default_instance.cpp |  24 +
 ...f8_f16_mk_nk_mn_comp_kpadding_instance.cpp |  24 +
 ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp |  24 +
 ...8_f16_mk_nk_mn_comp_mnpadding_instance.cpp |  24 +
 ...8_f16_mk_nk_mn_mem_v1_default_instance.cpp |  25 +
 ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp |  25 +
 ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp |  25 +
 ...8_f16_mk_nk_mn_mem_v2_default_instance.cpp |  25 +
 ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp |  25 +
 ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp |  25 +
 ..._universal_streamk_f8_f16_f16_mk_kn_mn.hpp |  85 ++
 ...f16_f16_mk_kn_mn_comp_default_instance.cpp |  24 +
 ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp |  24 +
 ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp |  24 +
 ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp |  24 +
 ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp |  25 +
 ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp |  25 +
 ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp |  25 +
 ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp |  25 +
 ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp |  25 +
 ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp |  25 +
 ..._universal_streamk_f8_f16_f16_mk_nk_mn.hpp |  90 ++
 ...f16_f16_mk_nk_mn_comp_default_instance.cpp |  24 +
 ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp |  24 +
 ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp |  24 +
 ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp |  24 +
 ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp |  25 +
 ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp |  25 +
 ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp |  25 +
 ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp |  25 +
 ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp |  25 +
 ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp |  25 +
 .../gemm_universal_streamk/CMakeLists.txt     |  26 -
 ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp |  91 --
 ...f16_f16_mk_kn_mn_comp_default_instance.cpp |  30 -
 ...16_f16_mk_kn_mn_comp_kpadding_instance.cpp |  30 -
 ..._f16_mk_kn_mn_comp_mnkpadding_instance.cpp |  30 -
 ...6_f16_mk_kn_mn_comp_mnpadding_instance.cpp |  30 -
 ...6_f16_mk_kn_mn_mem_v1_default_instance.cpp |  31 -
 ..._f16_mk_kn_mn_mem_v1_kpadding_instance.cpp |  31 -
 ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp |  31 -
 ...6_f16_mk_kn_mn_mem_v2_default_instance.cpp |  31 -
 ..._f16_mk_kn_mn_mem_v2_kpadding_instance.cpp |  31 -
 ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp |  31 -
 ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp |  98 ---
 ...f16_f16_mk_nk_mn_comp_default_instance.cpp |  30 -
 ...16_f16_mk_nk_mn_comp_kpadding_instance.cpp |  30 -
 ..._f16_mk_nk_mn_comp_mnkpadding_instance.cpp |  30 -
 ...6_f16_mk_nk_mn_comp_mnpadding_instance.cpp |  30 -
 ...6_f16_mk_nk_mn_mem_v1_default_instance.cpp |  31 -
 ..._f16_mk_nk_mn_mem_v1_kpadding_instance.cpp |  31 -
 ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp |  31 -
 ...6_f16_mk_nk_mn_mem_v2_default_instance.cpp |  31 -
 ..._f16_mk_nk_mn_mem_v2_kpadding_instance.cpp |  31 -
 ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp |  31 -
 modified_files.txt                            |  10 +
 .../src/profile_gemm_universal_streamk.cpp    |  24 +-
 80 files changed, 2887 insertions(+), 992 deletions(-)
 create mode 100755 example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
 mode change 100644 => 100755 example/01_gemm/run_gemm_example_streamk_v2.inc
 mode change 100644 => 100755 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
 mode change 100644 => 100755 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100755 modified_files.txt
 mode change 100644 => 100755 profiler/src/profile_gemm_universal_streamk.cpp

diff --git a/README.md b/README.md
index 302173dc1..d8eb152ee 100644
--- a/README.md
+++ b/README.md
@@ -154,8 +154,7 @@ Additional cmake flags can be used to significantly speed-up the build:
   other platforms have faster instances, such as `xdl` or `wmma`, available.
 
 * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
-  such as `gemm_universal` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not
-  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+  such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
   architectures like the MI100/MI200 for the functional support only.
 
 ## Using sccache for building
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 52c8ab580..957acce16 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -77,6 +77,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
 add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
 
+add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3)
+
 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
 
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 6e1c9f2a0..67bf92bbb 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -44,7 +44,7 @@ struct ProblemSizeStreamK final
     ck::index_t StrideB = -1;
     ck::index_t StrideC = -1;
 
-    ck::index_t NumSKBlocks = -1;
+    ck::index_t NumSKBlocks = -1; // number of stream-k blocks
 };
 struct ProblemSizeStreamK_universal final
 {
diff --git a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
index 5b163962b..36ac51f1d 100644
--- a/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_streamk_v3.cpp
@@ -8,7 +8,7 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;
 
 using ALayout = Row;
@@ -43,6 +43,17 @@ using DeviceGemmV2_Streamk_Instance =
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
 
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
 #include "run_gemm_example_streamk_v2.inc"
 
 int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
new file mode 100755
index 000000000..3b79ae9b8
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+using ADataType        = ck::f8_t;
+using BDataType        = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmV2_Streamk_Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType,   BDataType,  CDataType,  AccDataType,  CShuffleDataType, 
+        PassThrough, PassThrough, PassThrough, GemmDefault, 
+          256,
+        128, 256, 
+        128, 16, 16,
+        16,   16,
+        4,    8, 
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 16, 16, 1,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 16, 16, 1,
+        1, 2, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example_streamk_v2.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc
old mode 100644
new mode 100755
index 8ed8b81be..04243b829
--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -176,6 +176,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -196,6 +197,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
     DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
 
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -240,6 +243,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
+    std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
+    if(workspace_size != 0)
+    {
+        workspace.Realloc(workspace_size);
+        gemm.SetWorkSpacePointer(&argument, workspace.GetDeviceBuffer());
+    }
+
     bool pass = true;
     if((config.do_verification == 1) || (config.do_verification == 3))
     {
@@ -271,6 +281,36 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #endif
     }
 
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
+        // GPU verification
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_device_ref_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
     if(config.time_kernel)
     {
         ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
old mode 100644
new mode 100755
index 452063156..cfd9a1204
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -131,6 +131,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+
             if(stream_config.log_level_ > 0)
             {
                 arg.Print();
@@ -147,26 +148,27 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
             index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
 
             const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
-            hipGetErrorString(hipMemsetAsync(
-                arg.p_c_grid, 0, arg.M * arg.N * sizeof(CDataType), stream_config.stream_id_));
+
+            if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy ==
+                         StreamKReductionStrategy::Atomic)
+            {
+
+                hip_check_error(hipMemsetAsync(
+                    arg.p_c_grid, 0, arg.M * arg.N * sizeof(CDataType), stream_config.stream_id_));
+            }
+
             const auto Run = [&](const auto& kernel) {
                 dim3 grid_dim;
                 if(arg.Grid_size < 0)
                 {
                     int occupancy, num_cu;
-                    hipError_t rtn;
-                    rtn = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-                        &occupancy, kernel, BlockSize, 0);
-                    hip_check_error(rtn);
-
+                    hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+                        &occupancy, kernel, BlockSize, 0));
                     hipDeviceProp_t dev_prop;
                     hipDevice_t dev;
-                    rtn = hipGetDevice(&dev);
-                    hip_check_error(rtn);
-                    rtn = hipGetDeviceProperties(&dev_prop, dev);
-                    hip_check_error(rtn);
-                    num_cu = dev_prop.multiProcessorCount;
-
+                    hip_check_error(hipGetDevice(&dev));
+                    hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
+                    num_cu        = dev_prop.multiProcessorCount;
                     arg.Grid_size = num_cu * occupancy;
                     grid_dim      = arg.Grid_size;
                 }
@@ -196,8 +198,31 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                 else
                 {
 
-                    ave_time = launch_and_time_kernel(
-                        stream_config, kernel, grid_dim, dim3(BlockSize), 0, arg);
+                    if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy ==
+                                 StreamKReductionStrategy::Atomic)
+                    {
+                        ave_time = launch_and_time_kernel(
+                            stream_config, kernel, grid_dim, dim3(BlockSize), 0, arg);
+                    }
+                    else if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy ==
+                                      StreamKReductionStrategy::Reduction)
+                    {
+                        char* workspace_semaphore =
+                            reinterpret_cast<char*>(arg.p_workspace_) +
+                            arg.block_2_ctile_map_streamk.get_workspace_size_for_acc(
+                                sizeof(GemmAccDataType));
+                        auto preprocess = [&]() {
+                            hipMemsetAsync(
+                                workspace_semaphore,
+                                0,
+                                // sizeof(uint32_t),
+                                arg.block_2_ctile_map_streamk.get_workspace_size_for_semaphore(),
+                                stream_config.stream_id_);
+                        };
+
+                        ave_time = launch_and_time_kernel_with_preprocess(
+                            stream_config, preprocess, kernel, grid_dim, dim3(BlockSize), 0, arg);
+                    }
                 }
             };
 
@@ -211,14 +236,12 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                              BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                 {
 
-                    {
-                        const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy>;
-                        Run(kernel);
-                    }
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    true,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy>;
+
+                    Run(kernel);
                 }
                 // Tail number could be One to Seven
                 else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
@@ -340,53 +363,49 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                 else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
                 {
 
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
                     {
-                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
-                                                                 true,
-                                                                 InMemoryDataOperationEnum::Set,
-                                                                 minimum_occupancy,
-                                                                 TailNumber::Odd>;
-                            Run(kernel);
-                        }
-                        else
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
-                                                                 true,
-                                                                 InMemoryDataOperationEnum::Set,
-                                                                 minimum_occupancy,
-                                                                 TailNumber::Even>;
-                            Run(kernel);
-                        }
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             minimum_occupancy,
+                                                             TailNumber::Odd>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             minimum_occupancy,
+                                                             TailNumber::Even>;
+                        Run(kernel);
                     }
                 }
                 else
                 {
 
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
                     {
-                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                            true,
-                                                            InMemoryDataOperationEnum::Set,
-                                                            minimum_occupancy,
-                                                            TailNumber::Odd>;
-                            Run(kernel);
-                        }
-                        else
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                            true,
-                                                            InMemoryDataOperationEnum::Set,
-                                                            minimum_occupancy,
-                                                            TailNumber::Even>;
-                            Run(kernel);
-                        }
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Even>;
+                        Run(kernel);
                     }
                 }
             }
@@ -396,14 +415,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                 if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
                 {
 
-                    {
-                        const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                        false,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy>;
-                        Run(kernel);
-                    }
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    false,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy>;
+                    Run(kernel);
                 }
             }
 
@@ -418,6 +434,29 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
         }
     };
 
+    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
+    {
+        const Argument* p_arg = dynamic_cast<const Argument*>(pArg);
+        if constexpr(GridwiseGemm::Block2CTileMap_streamk::ReductionStrategy ==
+                     StreamKReductionStrategy::Reduction)
+        {
+            return p_arg->block_2_ctile_map_streamk.get_workspace_size(sizeof(GemmAccDataType));
+        }
+        else
+        {
+            return 0;
+        }
+    }
+
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
+    {
+        Argument* pArg_ = dynamic_cast<Argument*>(pArg);
+
+        pArg_->p_workspace_ = p_workspace;
+    }
+
     static constexpr bool IsValidCompilationParameter()
     {
         // TODO: properly implement this check
@@ -464,8 +503,205 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                              CElementwiseOperation)
     {
 
-        return Argument{
-            p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size}; // HS
+        constexpr index_t minimum_occupancy =
+            BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+        index_t K_split                  = (K + KPerBlock - 1) / KPerBlock * KPerBlock;
+        const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+        int occupancy, num_cu;
+        const auto calculate_grid_size = [&](const auto& kernel) {
+            hip_check_error(
+                hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
+            hipDeviceProp_t dev_prop;
+            hipDevice_t dev;
+            hip_check_error(hipGetDevice(&dev));
+            hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
+            num_cu    = dev_prop.multiProcessorCount;
+            Grid_size = num_cu * occupancy;
+        };
+
+        if(has_main_k_block_loop)
+        {
+            // Tail number always full
+            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                         BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+            {
+
+                const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy>;
+                calculate_grid_size(kernel);
+            }
+            // Tail number could be One to Seven
+            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+            {
+
+                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                {
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    true,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::One>;
+                    calculate_grid_size(kernel);
+                }
+                else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
+                {
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    true,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Full>;
+                    calculate_grid_size(kernel);
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Two>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Three)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Three>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Four)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Four>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Five)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Five>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Six>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+
+                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Seven)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Seven>;
+                        calculate_grid_size(kernel);
+                    }
+                }
+            }
+            // Tail number could be Odd or Even
+            else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+            {
+
+                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                {
+                    const auto kernel =
+                        kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                         true,
+                                                         InMemoryDataOperationEnum::Set,
+                                                         minimum_occupancy,
+                                                         TailNumber::Odd>;
+                    calculate_grid_size(kernel);
+                }
+                else
+                {
+                    const auto kernel =
+                        kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                         true,
+                                                         InMemoryDataOperationEnum::Set,
+                                                         minimum_occupancy,
+                                                         TailNumber::Even>;
+                    calculate_grid_size(kernel);
+                }
+            }
+            else
+            {
+
+                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                {
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    true,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Odd>;
+                    calculate_grid_size(kernel);
+                }
+                else
+                {
+                    const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                    true,
+                                                                    InMemoryDataOperationEnum::Set,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Even>;
+                    calculate_grid_size(kernel);
+                }
+            }
+        }
+        else
+        {
+            // Tail number always 1
+            if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+            {
+
+                const auto kernel = kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                false,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy>;
+                calculate_grid_size(kernel);
+            }
+        }
+
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
old mode 100644
new mode 100755
index ff1021535..6ef35da48
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -14,6 +14,8 @@
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/workgroup_barrier.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
 
 namespace ck {
 
@@ -38,7 +40,7 @@ __global__ void
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg, karg.p_workspace_);
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -62,7 +64,13 @@ __global__ void
     __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
+        karg.p_a_grid,
+        karg.p_b_grid,
+        karg.p_c_grid,
+        p_shared_0,
+        p_shared_1,
+        karg,
+        karg.p_workspace_);
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -521,7 +529,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, Streamk_sel_, Grid_size_},
               p_a_grid{p_a_grid_},
               p_b_grid{p_b_grid_},
-              p_c_grid{p_c_grid_}
+              p_c_grid{p_c_grid_},
+              block_2_ctile_map_streamk(
+                  M_, N_, AK0Number * CalculateKPadded(K_, 1), Grid_size_, Streamk_sel_)
 
         {
         }
@@ -529,6 +539,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         const ADataType* p_a_grid;
         const BDataType* p_b_grid;
         CDataType* p_c_grid;
+        BlockToCTileMap_GemmStreamK_v2<MPerBlock,
+                                       NPerBlock,
+                                       KPerBlock,
+                                       StreamKReductionStrategy::Atomic,
+                                       8,
+                                       4>
+            block_2_ctile_map_streamk;
     };
 
     struct SplitKBatchOffset
@@ -853,6 +870,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
     }
 
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(Number<MXdlPerWave / CShuffleMXdlPerWavePerShuffle>{},
+                       Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                       Number<NXdlPerWave / CShuffleNXdlPerWavePerShuffle>{},
+                       Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+    }
+
     using BlockwiseGemmPipe =
         remove_cvref_t<decltype(BlockGemmPipeline_Selector<
                                 BlkGemmPipelineVer,
@@ -1118,6 +1148,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         return c_grid_desc_mblock_mperblock_nblock_nperblock;
     }
 
+    __host__ __device__ static constexpr auto GetClusterLengthReduction()
+    {
+        // TODO: assume C is row major
+        // TODO: we always first loop over N, then M
+        constexpr auto NPerBlockPow2 = math::next_power_of_two<NPerBlock>();
+        constexpr auto NPerBlockReduction =
+            NPerBlockPow2 / CShuffleBlockTransferScalarPerVector_NPerBlock;
+        constexpr auto MPerBlockReduction =
+            (BlockSize + NPerBlockReduction - 1) / NPerBlockReduction;
+        return Sequence<MPerBlockReduction, NPerBlockReduction>{};
+    }
+
+    __host__ __device__ static constexpr auto GetPartialAccBlockDescriptor()
+    {
+        const auto c_partial_acc_block_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock),
+                                                    make_tuple(NPerBlock, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MPerBlock, NPerBlock),
+                                                    make_tuple(I1, MPerBlock));
+            }
+        }();
+        return c_partial_acc_block_m_n;
+    }
     using Block2CTileMap_streamk = BlockToCTileMap_GemmStreamK_v2<MPerBlock,
                                                                   NPerBlock,
                                                                   KPerBlock,
@@ -1132,22 +1190,42 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                                const BDataType* p_b_grid,
                                CDataType* p_c_grid,
                                void* p_shared,
-                               Problem& problem)
+                               Problem& problem,
+                               void* p_workspace)
     {
-
         const AElementwiseOperation a_element_op{};
         const BElementwiseOperation b_element_op{};
         const CElementwiseOperation c_element_op{};
 
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
         Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M,
                                                          problem.N,
                                                          AK0Number * problem.KPadded,
                                                          problem.Grid_size,
                                                          problem.Streamk_sel);
         uint32_t iter_start, iter_end;
-        bool is_sk_block, is_dp_block;
+        bool is_sk_block, is_dp_block, is_reduction_block;
         index_t num_k_block_main_loop;
-
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        uint32_t* p_semaphore = reinterpret_cast<uint32_t*>(
+            reinterpret_cast<char*>(p_workspace) +
+            block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType)));
         for(auto block_idx = get_block_1d_id();
             block_idx < block_2_ctile_map_streamk.get_grid_dims();
             block_idx += gridDim.x)
@@ -1163,6 +1241,214 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
             num_k_block_main_loop = iter_end - iter_start;
 
+            if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                         StreamKReductionStrategy::Reduction)
+            {
+                is_reduction_block = static_cast<uint32_t>(block_idx) >=
+                                     block_2_ctile_map_streamk.reduction_start_block_idx;
+                if(is_reduction_block)
+                {
+                    // descriptors
+                    constexpr auto cluster_length_reduce = GetClusterLengthReduction();
+                    constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce);
+                    const auto reduce_thread_cluster_idx =
+                        reduce_desc.CalculateBottomIndex(make_multi_index(block_idx));
+                    const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0];
+                    const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1];
+
+                    constexpr auto MReduceIters = math::integer_divide_ceil(
+                        Number<MPerBlock>{}, cluster_length_reduce.At(I0));
+                    constexpr auto NReduceIters = math::integer_divide_ceil(
+                        Number<NPerBlock>{},
+                        cluster_length_reduce.At(I1) *
+                            Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{});
+
+                    constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed(
+                        make_tuple(I1, Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{}));
+                    constexpr auto acc_thread_buf_store_desc =
+                        make_naive_tensor_descriptor_packed(make_tuple(
+                            I1, I1, I1, Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{}));
+
+                    constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor();
+
+                    constexpr auto partial_acc_load_step_n =
+                        make_multi_index(0,
+                                         cluster_length_reduce.At(I1) *
+                                             CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_load_step_n_reverse = make_multi_index(
+                        0,
+                        -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) *
+                            CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_load_step_m =
+                        make_multi_index(cluster_length_reduce.At(I0), 0);
+
+                    constexpr auto partial_acc_store_step_n =
+                        make_multi_index(0,
+                                         0,
+                                         0,
+                                         cluster_length_reduce.At(I1) *
+                                             CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_store_step_n_reverse = make_multi_index(
+                        0,
+                        0,
+                        0,
+                        -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) *
+                            CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_store_step_m =
+                        make_multi_index(0, cluster_length_reduce.At(I0), 0, 0);
+
+                    StaticBuffer<AddressSpaceEnum::Vgpr,
+                                 AccDataType,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock,
+                                 true>
+                        parcial_acc_buf;
+                    StaticBuffer<AddressSpaceEnum::Vgpr,
+                                 AccDataType,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock,
+                                 true>
+                        acc_buf;
+
+                    // start to compute
+                    auto reduction_idx =
+                        block_idx - block_2_ctile_map_streamk.reduction_start_block_idx;
+                    auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial(
+                        reduction_idx, problem.M, problem.N);
+
+                    workgroup_barrier wg_barrier(p_semaphore);
+
+                    uint32_t tile_acc_offset_start =
+                        block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx);
+                    uint32_t tile_acc_offset_end =
+                        block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx +
+                                                                                  1);
+                    __syncthreads();
+
+                    auto acc_load = ThreadwiseTensorSliceTransfer_v2<
+                        AccDataType,                        // SrcData,
+                        AccDataType,                        // DstData,
+                        decltype(c_partial_acc_block_m_n),  // SrcDesc,
+                        decltype(acc_thread_buf_load_desc), // DstDesc,
+                        Sequence<1,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths,
+                        Sequence<0, 1>,                                           // DimAccessOrder,
+                        1,                                                        // SrcVectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector,
+                        1,                                              // SrcScalarStrideInVector,
+                        false // SrcResetCoordinateAfterRun,
+                        >{c_partial_acc_block_m_n,
+                          make_multi_index(thread_m_cluster_id,
+                                           thread_n_cluster_id *
+                                               CShuffleBlockTransferScalarPerVector_NPerBlock)};
+
+                    auto acc_store = ThreadwiseTensorSliceTransfer_v1r3<
+                        AccDataType,                                             // SrcData,
+                        CDataType,                                               // DstData,
+                        decltype(acc_thread_buf_store_desc),                     // SrcDesc,
+                        decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc,
+                        CElementwiseOperation, // ElementwiseOperation,
+                        Sequence<1,
+                                 1,
+                                 1,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths,
+                        Sequence<0, 1, 2, 3>,                                     // DimAccessOrder,
+                        3,                                                        // DstVectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector,
+                        InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum DstInMemOp,
+                        1,                              // DstScalarStrideInVector,
+                        false                           // DstResetCoordinateAfterRun,
+                        >{c_grid_desc_mblock_mperblock_nblock_nperblock,
+                          make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]),
+                                           thread_m_cluster_id,
+                                           __builtin_amdgcn_readfirstlane(spatial_idx[I1]),
+                                           thread_n_cluster_id *
+                                               CShuffleBlockTransferScalarPerVector_NPerBlock),
+                          CElementwiseOperation{}};
+
+                    wg_barrier.wait_eq(reduction_idx, tile_acc_offset_end - tile_acc_offset_start);
+
+                    if(threadIdx.x == 0)
+                    {
+                        p_semaphore[reduction_idx] = 0;
+                    }
+                    using Accumulation = ck::detail::
+                        AccumulateWithNanCheck<false /*PropagateNan*/, reduce::Add, AccDataType>;
+
+                    for(int i_m = 0; i_m < MReduceIters; i_m++)
+                    {
+                        static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) {
+                            acc_buf.Clear();
+                            for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++)
+                            {
+                                auto c_partial_acc_buf =
+                                    make_dynamic_buffer<AddressSpaceEnum::Global,
+                                                        AmdBufferCoherenceEnum::GLC>(
+                                        reinterpret_cast<AccDataType*>(p_workspace) +
+                                            i * c_partial_acc_block_m_n.GetElementSpaceSize(),
+                                        c_partial_acc_block_m_n.GetElementSpaceSize());
+
+                                acc_load.Run(c_partial_acc_block_m_n,
+                                             c_partial_acc_buf,
+                                             acc_thread_buf_load_desc,
+                                             make_tuple(I0, I0),
+                                             parcial_acc_buf);
+
+                                static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}(
+                                    [&](auto i_vec) {
+                                        constexpr auto offset =
+                                            acc_thread_buf_load_desc.CalculateOffset(
+                                                make_tuple(0, i_vec));
+                                        Accumulation::Calculate(acc_buf(Number<offset>{}),
+                                                                parcial_acc_buf[Number<offset>{}]);
+                                    });
+                            }
+
+                            if(thread_n_cluster_id *
+                                   CShuffleBlockTransferScalarPerVector_NPerBlock <
+                               NPerBlock)
+                            {
+                                acc_store.Run(acc_thread_buf_store_desc,
+                                              make_tuple(I0, I0, I0, I0),
+                                              acc_buf,
+                                              c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              c_grid_buf);
+                            }
+                            if constexpr(NReduceIters != 1)
+                            {
+                                if constexpr(i_n_reduce != (NReduceIters - 1))
+                                {
+                                    acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                                partial_acc_load_step_n);
+                                    acc_store.MoveDstSliceWindow(
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        partial_acc_store_step_n);
+                                }
+                                else
+                                {
+                                    acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                                partial_acc_load_step_n_reverse);
+                                    acc_store.MoveDstSliceWindow(
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        partial_acc_store_step_n_reverse);
+                                }
+                            }
+                        });
+                        {
+                            acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                        partial_acc_load_step_m);
+                            acc_store.MoveDstSliceWindow(
+                                c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                partial_acc_store_step_m);
+                        }
+                    }
+
+                    continue;
+                }
+            }
+
+            // offset for last acc buffer of this block
+            uint32_t block_acc_offset =
+                (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) *
+                MPerBlock * NPerBlock;
             while(true)
             {
                 uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1173,33 +1459,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     iter_end - 1, tile_idx, iter_offset);
                 iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
 
-                const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
-                                                                                 problem.MPadded,
-                                                                                 problem.K,
-                                                                                 problem.KPadded,
-                                                                                 problem.StrideA,
-                                                                                 problem.AK0);
-                const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K,
-                                                                                 problem.KPadded,
-                                                                                 problem.N,
-                                                                                 problem.NPadded,
-                                                                                 problem.StrideB,
-                                                                                 problem.BK0);
-                const auto c_grid_desc_m_n       = MakeCGridDescriptor_M_N(
-                    problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
-
-                const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-                    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-                auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-
-                const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-
-                const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-
                 auto block_work_idx =
                     block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
 
@@ -1363,11 +1622,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                         GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
 
+                    constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle =
+                        GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle();
+
                     auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                         static_cast<CShuffleDataType*>(p_shared),
                         c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
                             .GetElementSpaceSize());
 
+                    auto c_partial_acc_buf =
+                        make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
+                            reinterpret_cast<AccDataType*>(p_workspace) + block_acc_offset,
+                            c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle
+                                .GetElementSpaceSize());
+
                     constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
                         transform_tensor_descriptor(
                             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
@@ -1477,7 +1745,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                          c_grid_desc_mblock_mperblock_nblock_nperblock,
                          make_multi_index(block_m_id, 0, block_n_id, 0),
                          c_element_op};
-
+                    // LDS to global partial acc
+                    auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2<
+                        ThisThreadBlock,       // index_t BlockSize,
+                        CElementwiseOperation, // ElementwiseOperation,
+                                               // InMemoryDataOperationEnum::Set, // DstInMemOp,
+                        Sequence<1,
+                                 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                 1,
+                                 CShuffleNXdlPerWavePerShuffle * NWave *
+                                     NPerXdl>, // BlockSliceLengths,
+                        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                        Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                        CShuffleDataType,     // typename SrcData,
+                        CShuffleDataType,     // typename DstData,
+                        decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                        decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle),
+                        Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                        3,                                              // index_t VectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                        false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be
+                               // false, othre wise has scratch
+                        false> // bool ThreadTransferDstResetCoordinateAfterRun, => need to be
+                               // false, othre wise has scratch
+                        {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                         make_multi_index(0, 0, 0, 0),
+                         c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                         make_multi_index(0, 0, 0, 0),
+                         c_element_op};
                     // space filling curve for threadwise C in VGPR
                     constexpr auto sfc_c_vgpr =
                         SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
@@ -1535,15 +1830,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                         }
                         else if(is_sk_block)
                         {
-                            // each block copy its data from LDS to global
-                            c_shuffle_block_copy_lds_to_global
-                                .template Run<decltype(c_shuffle_block_buf),
-                                              decltype(c_grid_buf),
-                                              InMemoryDataOperationEnum::AtomicAdd>(
+                            if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                                         StreamKReductionStrategy::Atomic)
+                            {
+                                // each block copy its data from LDS to global
+                                c_shuffle_block_copy_lds_to_global
+                                    .template Run<decltype(c_shuffle_block_buf),
+                                                  decltype(c_grid_buf),
+                                                  InMemoryDataOperationEnum::AtomicAdd>(
+                                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                                        c_shuffle_block_buf,
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        c_grid_buf);
+                            }
+                            else if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                                              StreamKReductionStrategy::Reduction)
+                            {
+                                // constexpr offset
+                                c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin(
                                     c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
-                                    c_shuffle_block_buf,
-                                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                    c_grid_buf);
+                                    make_tuple(0, 0, 0, 0));
+
+                                c_block_copy_lds_to_partial_acc.SetDstSliceOrigin(
+                                    c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                                    make_tuple(MXdlPerWave, 0, NXdlPerWave, 0));
+
+                                c_block_copy_lds_to_partial_acc
+                                    .template Run<decltype(c_shuffle_block_buf),
+                                                  decltype(c_partial_acc_buf),
+                                                  InMemoryDataOperationEnum::Set>(
+                                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                                        c_shuffle_block_buf,
+                                        c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                                        c_partial_acc_buf);
+                            }
                         }
 
                         if constexpr(access_id < num_access - 1)
@@ -1555,15 +1875,33 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                                 c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
                         }
                     });
-                }
+
+                    if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                                 StreamKReductionStrategy::Reduction)
+                    {
+                        if(is_sk_block)
+                        {
+                            // increase the counter for this tile
+                            workgroup_barrier wg_barrier(p_semaphore);
+                            wg_barrier.inc(tile_idx);
+                        }
+                    }
+                } // shuffle c and write-out end
+
                 // exit condition
                 iter_end -= current_iter_length;
                 if(iter_end <= iter_start)
                     break;
+                if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                             StreamKReductionStrategy::Reduction)
+                {
+                    block_acc_offset -= MPerBlock * NPerBlock;
+                }
                 // make sure next loop LDS is ready for use
                 block_sync_lds();
-            }
-        }
+            } // while loop
+
+        } // for loop
     }
 
     template <bool HasMainKBlockLoop,
@@ -1574,19 +1912,43 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                                     CDataType* p_c_grid,
                                     void* p_shared_0,
                                     void* p_shared_1,
-                                    Problem& problem)
+                                    Problem& problem,
+                                    void* p_workspace)
     {
 
         const AElementwiseOperation a_element_op{};
         const BElementwiseOperation b_element_op{};
         const CElementwiseOperation c_element_op{};
 
-        Block2CTileMap_streamk block_2_ctile_map_streamk(
-            problem.M, problem.N, AK0Number * problem.KPadded, problem.Grid_size);
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+
         uint32_t iter_start, iter_end;
-        bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block;
+        bool is_sk_block, is_dp_block, is_reduction_block;
         index_t num_k_block_main_loop;
 
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M,
+                                                         problem.N,
+                                                         AK0Number * problem.KPadded,
+                                                         problem.Grid_size,
+                                                         problem.Streamk_sel);
         for(auto block_idx = get_block_1d_id();
             block_idx < block_2_ctile_map_streamk.get_grid_dims();
             block_idx += gridDim.x)
@@ -1601,6 +1963,235 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
             num_k_block_main_loop = iter_end - iter_start;
 
+            uint32_t* p_semaphore = reinterpret_cast<uint32_t*>(
+                reinterpret_cast<char*>(p_workspace) +
+                block_2_ctile_map_streamk.get_workspace_size_for_acc(sizeof(AccDataType)));
+
+            if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                         StreamKReductionStrategy::Reduction)
+            {
+                is_reduction_block = static_cast<uint32_t>(block_idx) >=
+                                     block_2_ctile_map_streamk.reduction_start_block_idx;
+                if(is_reduction_block)
+                {
+                    // descriptors
+                    constexpr auto cluster_length_reduce = GetClusterLengthReduction();
+                    constexpr auto reduce_desc = make_cluster_descriptor(cluster_length_reduce);
+                    const auto reduce_thread_cluster_idx =
+                        reduce_desc.CalculateBottomIndex(make_multi_index(block_idx));
+                    const auto thread_m_cluster_id = reduce_thread_cluster_idx[I0];
+                    const auto thread_n_cluster_id = reduce_thread_cluster_idx[I1];
+
+                    constexpr auto MReduceIters = math::integer_divide_ceil(
+                        Number<MPerBlock>{}, cluster_length_reduce.At(I0));
+                    constexpr auto NReduceIters = math::integer_divide_ceil(
+                        Number<NPerBlock>{},
+                        cluster_length_reduce.At(I1) *
+                            Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{});
+
+                    constexpr auto acc_thread_buf_load_desc = make_naive_tensor_descriptor_packed(
+                        make_tuple(I1, Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{}));
+                    constexpr auto acc_thread_buf_store_desc =
+                        make_naive_tensor_descriptor_packed(make_tuple(
+                            I1, I1, I1, Number<CShuffleBlockTransferScalarPerVector_NPerBlock>{}));
+
+                    constexpr auto c_partial_acc_block_m_n = GetPartialAccBlockDescriptor();
+
+                    constexpr auto partial_acc_load_step_n =
+                        make_multi_index(0,
+                                         cluster_length_reduce.At(I1) *
+                                             CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_load_step_n_reverse = make_multi_index(
+                        0,
+                        -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) *
+                            CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_load_step_m =
+                        make_multi_index(cluster_length_reduce.At(I0), 0);
+
+                    constexpr auto partial_acc_store_step_n =
+                        make_multi_index(0,
+                                         0,
+                                         0,
+                                         cluster_length_reduce.At(I1) *
+                                             CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_store_step_n_reverse = make_multi_index(
+                        0,
+                        0,
+                        0,
+                        -1 * cluster_length_reduce.At(I1).value * (NReduceIters - 1) *
+                            CShuffleBlockTransferScalarPerVector_NPerBlock);
+                    constexpr auto partial_acc_store_step_m =
+                        make_multi_index(0, cluster_length_reduce.At(I0), 0, 0);
+
+                    StaticBuffer<AddressSpaceEnum::Vgpr,
+                                 AccDataType,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock,
+                                 true>
+                        parcial_acc_buf;
+                    StaticBuffer<AddressSpaceEnum::Vgpr,
+                                 AccDataType,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock,
+                                 true>
+                        acc_buf;
+
+                    // start to compute
+                    auto reduction_idx =
+                        block_idx - block_2_ctile_map_streamk.reduction_start_block_idx;
+                    auto spatial_idx = block_2_ctile_map_streamk.tile_to_spatial(
+                        reduction_idx, problem.M, problem.N);
+
+                    workgroup_barrier wg_barrier(p_semaphore);
+
+                    uint32_t tile_acc_offset_start =
+                        block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx);
+                    uint32_t tile_acc_offset_end =
+                        block_2_ctile_map_streamk.get_acc_buffer_offset_from_tile(reduction_idx +
+                                                                                  1);
+
+                    uint32_t expected_count = tile_acc_offset_end - tile_acc_offset_start;
+
+                    if(threadIdx.x == 0)
+                    {
+                        p_semaphore[reduction_idx] = 0;
+                    }
+
+                    __syncthreads();
+
+                    auto acc_load = ThreadwiseTensorSliceTransfer_v2<
+                        AccDataType,                        // SrcData,
+                        AccDataType,                        // DstData,
+                        decltype(c_partial_acc_block_m_n),  // SrcDesc,
+                        decltype(acc_thread_buf_load_desc), // DstDesc,
+                        Sequence<1,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths,
+                        Sequence<0, 1>,                                           // DimAccessOrder,
+                        1,                                                        // SrcVectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // SrcScalarPerVector,
+                        1,                                              // SrcScalarStrideInVector,
+                        false // SrcResetCoordinateAfterRun,
+                        >{c_partial_acc_block_m_n,
+                          make_multi_index(thread_m_cluster_id,
+                                           thread_n_cluster_id *
+                                               CShuffleBlockTransferScalarPerVector_NPerBlock)};
+
+                    auto acc_store = ThreadwiseTensorSliceTransfer_v1r3<
+                        AccDataType,                                             // SrcData,
+                        CDataType,                                               // DstData,
+                        decltype(acc_thread_buf_store_desc),                     // SrcDesc,
+                        decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), // DstDesc,
+                        CElementwiseOperation, // ElementwiseOperation,
+                        Sequence<1,
+                                 1,
+                                 1,
+                                 CShuffleBlockTransferScalarPerVector_NPerBlock>, // SliceLengths,
+                        Sequence<0, 1, 2, 3>,                                     // DimAccessOrder,
+                        3,                                                        // DstVectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // DstScalarPerVector,
+                        InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum DstInMemOp,
+                        1,                              // DstScalarStrideInVector,
+                        false                           // DstResetCoordinateAfterRun,
+                        >{c_grid_desc_mblock_mperblock_nblock_nperblock,
+                          make_multi_index(__builtin_amdgcn_readfirstlane(spatial_idx[I0]),
+                                           thread_m_cluster_id,
+                                           __builtin_amdgcn_readfirstlane(spatial_idx[I1]),
+                                           thread_n_cluster_id *
+                                               CShuffleBlockTransferScalarPerVector_NPerBlock),
+                          CElementwiseOperation{}};
+
+#if 0
+                if(threadIdx.x == 0) {
+                    printf("bid:%d, rid:%d, os:%d,%d, spatial:%d,%d\n", static_cast<int>(blockIdx.x),
+                        reduction_idx, __builtin_amdgcn_readfirstlane(tile_acc_offset_start), __builtin_amdgcn_readfirstlane(tile_acc_offset_end),
+                        __builtin_amdgcn_readfirstlane(spatial_idx[I0]),
+                        __builtin_amdgcn_readfirstlane(spatial_idx[I1]));
+                }
+#endif
+                    if(threadIdx.x == 0)
+                    {
+                        atomicAdd(&p_semaphore[reduction_idx], 1);
+                    }
+
+                    wg_barrier.wait_eq(p_semaphore[reduction_idx], expected_count);
+                    using Accumulation = ck::detail::
+                        AccumulateWithNanCheck<false /*PropagateNan*/, reduce::Add, AccDataType>;
+
+                    for(int i_m = 0; i_m < MReduceIters; i_m++)
+                    {
+                        static_for<0, NReduceIters, 1>{}([&](auto i_n_reduce) {
+                            acc_buf.Clear();
+                            for(auto i = tile_acc_offset_start; i < tile_acc_offset_end; i++)
+                            {
+                                auto c_partial_acc_buf =
+                                    make_dynamic_buffer<AddressSpaceEnum::Global,
+                                                        AmdBufferCoherenceEnum::GLC>(
+                                        reinterpret_cast<AccDataType*>(p_workspace) +
+                                            i * c_partial_acc_block_m_n.GetElementSpaceSize(),
+                                        c_partial_acc_block_m_n.GetElementSpaceSize());
+
+                                acc_load.Run(c_partial_acc_block_m_n,
+                                             c_partial_acc_buf,
+                                             acc_thread_buf_load_desc,
+                                             make_tuple(I0, I0),
+                                             parcial_acc_buf);
+
+                                static_for<0, CShuffleBlockTransferScalarPerVector_NPerBlock, 1>{}(
+                                    [&](auto i_vec) {
+                                        constexpr auto offset =
+                                            acc_thread_buf_load_desc.CalculateOffset(
+                                                make_tuple(0, i_vec));
+                                        Accumulation::Calculate(acc_buf(Number<offset>{}),
+                                                                parcial_acc_buf[Number<offset>{}]);
+                                    });
+                            }
+
+                            if(thread_n_cluster_id *
+                                   CShuffleBlockTransferScalarPerVector_NPerBlock <
+                               NPerBlock)
+                            {
+                                acc_store.Run(acc_thread_buf_store_desc,
+                                              make_tuple(I0, I0, I0, I0),
+                                              acc_buf,
+                                              c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              c_grid_buf);
+                            }
+                            if constexpr(NReduceIters != 1)
+                            {
+                                if constexpr(i_n_reduce != (NReduceIters - 1))
+                                {
+                                    acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                                partial_acc_load_step_n);
+                                    acc_store.MoveDstSliceWindow(
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        partial_acc_store_step_n);
+                                }
+                                else
+                                {
+                                    acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                                partial_acc_load_step_n_reverse);
+                                    acc_store.MoveDstSliceWindow(
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        partial_acc_store_step_n_reverse);
+                                }
+                            }
+                        });
+                        {
+                            acc_load.MoveSrcSliceWindow(c_partial_acc_block_m_n,
+                                                        partial_acc_load_step_m);
+                            acc_store.MoveDstSliceWindow(
+                                c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                partial_acc_store_step_m);
+                        }
+                    }
+
+                    continue;
+                }
+            }
+
+            // offset for last acc buffer of this block
+            uint32_t block_acc_offset =
+                (block_2_ctile_map_streamk.get_acc_buffer_offset_from_block(block_idx + 1) - 1) *
+                MPerBlock * NPerBlock;
+            while(true)
             {
 
                 uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1611,33 +2202,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     iter_end - 1, tile_idx, iter_offset);
                 iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
 
-                const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
-                                                                                 problem.MPadded,
-                                                                                 problem.K,
-                                                                                 problem.KPadded,
-                                                                                 problem.StrideA,
-                                                                                 problem.AK0);
-                const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(problem.K,
-                                                                                 problem.KPadded,
-                                                                                 problem.N,
-                                                                                 problem.NPadded,
-                                                                                 problem.StrideB,
-                                                                                 problem.BK0);
-                const auto c_grid_desc_m_n       = MakeCGridDescriptor_M_N(
-                    problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
-
-                const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-                    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                        c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-
-                auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-
-                const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-                const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-
                 auto block_work_idx =
                     block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
 
@@ -1811,11 +2375,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                         GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
 
+                    constexpr auto c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle =
+                        GetCBlockDescriptor_MShuffle_MPerShuffle_NShuffle_NPerShuffle();
+
                     auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
                         static_cast<CShuffleDataType*>(p_shared_0),
                         c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
                             .GetElementSpaceSize());
 
+                    auto c_partial_acc_buf =
+                        make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
+                            reinterpret_cast<AccDataType*>(p_workspace) + block_acc_offset,
+                            c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle
+                                .GetElementSpaceSize());
+
                     constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
                         transform_tensor_descriptor(
                             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
@@ -1925,6 +2498,35 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                          make_multi_index(block_m_id, 0, block_n_id, 0),
                          c_element_op};
 
+                    // LDS to global partial acc
+                    auto c_block_copy_lds_to_partial_acc = ThreadGroupTensorSliceTransfer_v6r1r2<
+                        ThisThreadBlock,       // index_t BlockSize,
+                        CElementwiseOperation, // ElementwiseOperation,
+                                               // InMemoryDataOperationEnum::Set, // DstInMemOp,
+                        Sequence<1,
+                                 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                 1,
+                                 CShuffleNXdlPerWavePerShuffle * NWave *
+                                     NPerXdl>, // BlockSliceLengths,
+                        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                        Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                        CShuffleDataType,     // typename SrcData,
+                        CShuffleDataType,     // typename DstData,
+                        decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                        decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle),
+                        Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                        3,                                              // index_t VectorDim,
+                        CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                        false, // bool ThreadTransferSrcResetCoordinateAfterRun, => need to be
+                               // false, othre wise has scratch
+                        false> // bool ThreadTransferDstResetCoordinateAfterRun, => need to be
+                               // false, othre wise has scratch
+                        {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                         make_multi_index(0, 0, 0, 0),
+                         c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                         make_multi_index(0, 0, 0, 0),
+                         c_element_op};
+
                     // space filling curve for threadwise C in VGPR
                     constexpr auto sfc_c_vgpr =
                         SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
@@ -1982,15 +2584,40 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                         }
                         else if(is_sk_block)
                         {
-                            // each block copy its data from LDS to global
-                            c_shuffle_block_copy_lds_to_global
-                                .template Run<decltype(c_shuffle_block_buf),
-                                              decltype(c_grid_buf),
-                                              InMemoryDataOperationEnum::AtomicAdd>(
+                            if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                                         StreamKReductionStrategy::Atomic)
+                            {
+                                // each block copy its data from LDS to global
+                                c_shuffle_block_copy_lds_to_global
+                                    .template Run<decltype(c_shuffle_block_buf),
+                                                  decltype(c_grid_buf),
+                                                  InMemoryDataOperationEnum::AtomicAdd>(
+                                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                                        c_shuffle_block_buf,
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        c_grid_buf);
+                            }
+                            else if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                                              StreamKReductionStrategy::Reduction)
+                            {
+                                // constexpr offset
+                                c_block_copy_lds_to_partial_acc.SetSrcSliceOrigin(
                                     c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
-                                    c_shuffle_block_buf,
-                                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                    c_grid_buf);
+                                    make_tuple(0, 0, 0, 0));
+
+                                c_block_copy_lds_to_partial_acc.SetDstSliceOrigin(
+                                    c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                                    make_tuple(MXdlPerWave, 0, NXdlPerWave, 0));
+
+                                c_block_copy_lds_to_partial_acc
+                                    .template Run<decltype(c_shuffle_block_buf),
+                                                  decltype(c_partial_acc_buf),
+                                                  InMemoryDataOperationEnum::Set>(
+                                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                                        c_shuffle_block_buf,
+                                        c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle,
+                                        c_partial_acc_buf);
+                            }
                         }
                         if constexpr(access_id < num_access - 1)
                         {
@@ -2002,6 +2629,27 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                         }
                     });
                 }
+                // exit condition
+                iter_end -= current_iter_length;
+                if(iter_end <= iter_start)
+                    break;
+                if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                             StreamKReductionStrategy::Reduction)
+                {
+                    block_acc_offset -= MPerBlock * NPerBlock;
+                }
+                // make sure next loop LDS is ready for use
+                block_sync_lds();
+            }
+            if constexpr(Block2CTileMap_streamk::ReductionStrategy ==
+                         StreamKReductionStrategy::Reduction)
+            {
+                if(is_sk_block)
+                {
+                    // increase the counter for this tile
+                    workgroup_barrier wg_barrier(p_semaphore);
+                    wg_barrier.inc(0);
+                }
             }
         }
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
index 19fa6c209..f44c02517 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
@@ -237,6 +237,206 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin
                                                       PassThrough,
                                                       PassThrough>>>& instances);
 #endif
+
+#if(defined(CK_ENABLE_FP8))
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -327,6 +527,121 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
         }
 #endif
 
+#if(defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
         return op_ptrs;
     }
 };
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 6a1558a52..2c0b6c7b7 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -87,6 +87,12 @@ function(add_instance_library INSTANCE_NAME)
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_")
+         message("removing gemm_universal_streamk_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
     endif()
     #only continue if there are some source files left on the list
     if(ARGN)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
index 2a930ab9a..08746a52d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
@@ -21,6 +21,49 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
+
+        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
 
 add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp
new file mode 100644
index 000000000..d03002af5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8  = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   4,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        #endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,   128,   8,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,   128,   8,   4,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        #endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..239d3a67f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
new file mode 100644
index 000000000..9b65bbe9b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
new file mode 100644
index 000000000..38cda9bf8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
new file mode 100644
index 000000000..2afa4d5d6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..0f7dad4c5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 000000000..596817694
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100644
index 000000000..c4423e457
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..06f701f48
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 000000000..fda53c689
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100755
index 000000000..9272c74d7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp
new file mode 100644
index 000000000..7736f38cb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8  = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        #endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,   128,   8,   16,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,   128,   8,   16,  16,   16,    4,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,   128,   8,   16,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,   128,   8,   16,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,   128,   8,   16,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,   128,   8,   16,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,   128,   8,   16,  32,   32,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,   128,   8,   16,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,   128,   8,   16,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        #endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..4701d951a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
new file mode 100644
index 000000000..cb57860da
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
new file mode 100644
index 000000000..67be95888
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
new file mode 100755
index 000000000..f9e46a5f2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..419fcebdd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 000000000..7cbbc1813
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100644
index 000000000..e3ae25828
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..0c6aa0a4e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 000000000..75871166a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100644
index 000000000..8c91bc877
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp
new file mode 100644
index 000000000..57b6ab3ae
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8  = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,   128,    16,   8,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   192,   256,    64,    16,   8,  32,   32,    3,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,    16,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<8,  32, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,        
+        // We prefer following instance, however, existing compiler bug cause it failed to generate sanity code.
+        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,  F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,    16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        #endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,  16,   2,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,  16,   4,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,   128,  16,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,   128,  16,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,   128,  16,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,   128,  16,   2,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,   128,  16,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,   128,  16,   2,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,  16,   2,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,    F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,  16,   4,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,              16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        #endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..51a51d3c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
new file mode 100644
index 000000000..7613f5076
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
new file mode 100644
index 000000000..d015086f3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
new file mode 100644
index 000000000..4cb327f4f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..19b49c1f3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 000000000..9dd02b6e9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100644
index 000000000..e54568eaa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..cd1e17648
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 000000000..7996c4441
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100755
index 000000000..c2544be5f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp
new file mode 100644
index 000000000..14bd36d29
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8  = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,  16,   8,  16,   16,    8,    7,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        #endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances = std::tuple<
+// clang-format off
+    #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,   128,   16,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,   128,   16,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,   128,   16,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,   128,   16,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,   128,   16,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,   128,   16,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   16,   8,  16,   16,    1,    1,     S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,   128,   16,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,   128,   16,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,   128,   16,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,      F8,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,   128,   16,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        #endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 000000000..eefc77615
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
new file mode 100644
index 000000000..185874b24
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
new file mode 100644
index 000000000..a92181ccc
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
new file mode 100755
index 000000000..1551dba0f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 000000000..0f3e51db1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 000000000..f87b8f670
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100644
index 000000000..0058a2ad6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..3a3bd5df9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 000000000..fb50e2589
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100644
index 000000000..6413655b6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt
deleted file mode 100644
index 2a930ab9a..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# ONLY XDL_KERNELS
-set(GEMM_UNIVERSAL_STREAMK_INSTANCES)
-
-list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES 
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
-
-add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
deleted file mode 100644
index 6e8d5c798..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = half_t;
-using F32 = float;
-
-using Row = tensor_layout::gemm::RowMajor;
-using Col = tensor_layout::gemm::ColumnMajor;
-
-template <index_t... Is>
-using S = Sequence<Is...>;
-
-using PassThrough = element_wise::PassThrough;
-
-static constexpr auto GemmDefault    = GemmSpecialization::Default;
-static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
-static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
-static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
-
-static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
-
-template <GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
-
-template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
-        // Latency friendly
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // Memory friendly
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-    // clang-format on
-    >;
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
deleted file mode 100644
index 6adcb8f4f..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances<GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
deleted file mode 100644
index 631ae6872..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
deleted file mode 100644
index 2c49773a6..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
deleted file mode 100644
index 39d54fb88..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
deleted file mode 100644
index 8ee50d63c..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
-                                                                             GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
deleted file mode 100644
index d31e0819a..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
-                                                                             GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
deleted file mode 100644
index fe19f35e5..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Intrawave,
-                                                                             GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
deleted file mode 100644
index 6c1873b37..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
-                                                                             GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
deleted file mode 100644
index ffd53f406..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
-                                                                             GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
deleted file mode 100644
index 094b8f92f..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Row,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances<Interwave,
-                                                                             GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
deleted file mode 100644
index e00c1733e..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = half_t;
-using F32 = float;
-
-using Row = tensor_layout::gemm::RowMajor;
-using Col = tensor_layout::gemm::ColumnMajor;
-
-template <index_t... Is>
-using S = Sequence<Is...>;
-
-using PassThrough = element_wise::PassThrough;
-
-static constexpr auto GemmDefault    = GemmSpecialization::Default;
-static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
-static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
-static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
-
-static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
-
-template <GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
-        // Compute friendly
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // AGPR Spill
-        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // AGPR Spill when use permuted lds layout. so, use padding for these two.
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
-
-template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
-        // Latency friendly 
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // Memory friendly
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-    // clang-format on
-    >;
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
deleted file mode 100644
index 546f909b3..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
deleted file mode 100644
index d91de96be..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
deleted file mode 100644
index c70678b44..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
deleted file mode 100644
index 5410a0cc2..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmMNPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
deleted file mode 100644
index 4ae7329f9..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Intrawave,
-                                                                             GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
deleted file mode 100644
index 4fc5458a9..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Intrawave,
-                                                                             GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
deleted file mode 100644
index 7369f87a5..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Intrawave,
-                                                                             GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
deleted file mode 100644
index 45425a41a..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Interwave,
-                                                                             GemmDefault>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
deleted file mode 100644
index 3b5ac0366..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Interwave,
-                                                                             GemmKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
deleted file mode 100644
index 53aa011a7..000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
-                                                      Col,
-                                                      Row,
-                                                      F16,
-                                                      F16,
-                                                      F16,
-                                                      PassThrough,
-                                                      PassThrough,
-                                                      PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances<Interwave,
-                                                                             GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/modified_files.txt b/modified_files.txt
new file mode 100755
index 000000000..34a42e3f3
--- /dev/null
+++ b/modified_files.txt
@@ -0,0 +1,10 @@
+example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
+example/01_gemm/run_gemm_example_streamk_v2.inc
+include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+profiler/src/profile_gemm_universal_streamk.cpp
+modified_files.txt
diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp
old mode 100644
new mode 100755
index cd3f5787d..85f6c2577
--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -85,8 +85,10 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
 
     using F32 = float;
     using F16 = ck::half_t;
-    // using BF16 = ck::bhalf_t;
-    // using F8   = ck::f8_t;
+
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    using F8 = ck::f8_t;
+#endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -145,6 +147,24 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
     {
         return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+#endif
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
-- 
GitLab


From 4c7035ff08f17aa138a747b8ea00ccf47276d85c Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 22 Nov 2024 08:30:01 -0800
Subject: [PATCH 070/153] fix path of ninjatracing (#1685)

---
 Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b06726335..76e6f0ebe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -87,17 +87,17 @@ RUN pip install --upgrade cmake==3.27.5 && \
     git clone https://github.com/ccache/ccache.git && \
     cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
+    cd / && \
     wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
     gunzip /usr/local/bin/ninja.gz && \
     chmod a+x /usr/local/bin/ninja && \
     git clone https://github.com/nico/ninjatracing.git && \
 #Install latest cppcheck
     git clone https://github.com/danmar/cppcheck.git && \
-    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build .
-WORKDIR /
-
+    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
+    cd / && \
 # Install an init system
-RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
+    wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
     dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
     pip3 install --upgrade pip && \
-- 
GitLab


From ff92222f937b54955011d394f46130fc5002110c Mon Sep 17 00:00:00 2001
From: schung-amd <Steven.Chung@amd.com>
Date: Fri, 22 Nov 2024 17:51:35 -0500
Subject: [PATCH 071/153] [CK_TILE] MakeKargs overloads for backward
 compatibility (#1681)

* Add overloads for MakeKargs

Overload MakeKargs to accept std::tuple<uint64_t, uint64_t> and std::tuple<void*, void*> to preserve functionality of code currently passing in list initializers or tuples.

* Add overloads for MakeKargs

Overload MakeKargs to accept std::tuple<uint64_t, uint64_t> and std::tuple<void*, void*> to preserve functionality of code currently passing in list initializers or tuples.

* Re-format files using ck_tile remod.py

---------

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 .../ops/fmha/kernel/fmha_bwd_kernel.hpp       | 444 ++++++++++++++++++
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       | 338 +++++++++++++
 2 files changed, 782 insertions(+)

diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index c5858a20f..ccf15ee60 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -470,6 +470,248 @@ struct FmhaBwdDQDKDVKernel
         return kargs;
     }
 
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = !kIsGroupMode>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              const void* lse_ptr,
+              const void* do_ptr,
+              const void* d_ptr,
+              void* rand_val_ptr,
+              void* dk_ptr,
+              void* dv_ptr,
+              void* dbias_ptr,
+              void* dq_acc_ptr,
+              ck_tile::index_t seqlen_q,
+              ck_tile::index_t seqlen_k,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_do,
+              ck_tile::index_t stride_dq_acc,
+              ck_tile::index_t stride_dk,
+              ck_tile::index_t stride_dv,
+              ck_tile::index_t stride_dbias,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_do,
+              ck_tile::index_t nhead_stride_lsed,
+              ck_tile::index_t nhead_stride_dq_acc,
+              ck_tile::index_t nhead_stride_dk,
+              ck_tile::index_t nhead_stride_dv,
+              ck_tile::index_t nhead_stride_dbias,
+              ck_tile::index_t batch_stride_q,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t batch_stride_bias,
+              ck_tile::index_t batch_stride_randval,
+              ck_tile::index_t batch_stride_do,
+              ck_tile::index_t batch_stride_lsed,
+              ck_tile::index_t batch_stride_dq_acc,
+              ck_tile::index_t batch_stride_dk,
+              ck_tile::index_t batch_stride_dv,
+              ck_tile::index_t batch_stride_dbias,
+              ck_tile::index_t split_stride_dq_acc,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            lse_ptr,
+            do_ptr,
+            d_ptr,
+            rand_val_ptr,
+            dk_ptr,
+            dv_ptr,
+            dbias_ptr,
+            dq_acc_ptr,
+            seqlen_q,
+            seqlen_k,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_do,
+            stride_dq_acc,
+            stride_dk,
+            stride_dv,
+            stride_dbias,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_do,
+            nhead_stride_lsed,
+            nhead_stride_dq_acc,
+            nhead_stride_dk,
+            nhead_stride_dv,
+            nhead_stride_dbias,
+            batch_stride_q,
+            batch_stride_k,
+            batch_stride_v,
+            batch_stride_bias,
+            batch_stride_randval,
+            batch_stride_do,
+            batch_stride_lsed,
+            batch_stride_dq_acc,
+            batch_stride_dk,
+            batch_stride_dv,
+            batch_stride_dbias,
+            split_stride_dq_acc,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = !kIsGroupMode>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              const void* lse_ptr,
+              const void* do_ptr,
+              const void* d_ptr,
+              void* rand_val_ptr,
+              void* dk_ptr,
+              void* dv_ptr,
+              void* dbias_ptr,
+              void* dq_acc_ptr,
+              ck_tile::index_t seqlen_q,
+              ck_tile::index_t seqlen_k,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_do,
+              ck_tile::index_t stride_dq_acc,
+              ck_tile::index_t stride_dk,
+              ck_tile::index_t stride_dv,
+              ck_tile::index_t stride_dbias,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_do,
+              ck_tile::index_t nhead_stride_lsed,
+              ck_tile::index_t nhead_stride_dq_acc,
+              ck_tile::index_t nhead_stride_dk,
+              ck_tile::index_t nhead_stride_dv,
+              ck_tile::index_t nhead_stride_dbias,
+              ck_tile::index_t batch_stride_q,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t batch_stride_bias,
+              ck_tile::index_t batch_stride_randval,
+              ck_tile::index_t batch_stride_do,
+              ck_tile::index_t batch_stride_lsed,
+              ck_tile::index_t batch_stride_dq_acc,
+              ck_tile::index_t batch_stride_dk,
+              ck_tile::index_t batch_stride_dv,
+              ck_tile::index_t batch_stride_dbias,
+              ck_tile::index_t split_stride_dq_acc,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              const std::tuple<void*, void*>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            lse_ptr,
+            do_ptr,
+            d_ptr,
+            rand_val_ptr,
+            dk_ptr,
+            dv_ptr,
+            dbias_ptr,
+            dq_acc_ptr,
+            seqlen_q,
+            seqlen_k,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_do,
+            stride_dq_acc,
+            stride_dk,
+            stride_dv,
+            stride_dbias,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_do,
+            nhead_stride_lsed,
+            nhead_stride_dq_acc,
+            nhead_stride_dk,
+            nhead_stride_dv,
+            nhead_stride_dbias,
+            batch_stride_q,
+            batch_stride_k,
+            batch_stride_v,
+            batch_stride_bias,
+            batch_stride_randval,
+            batch_stride_do,
+            batch_stride_lsed,
+            batch_stride_dq_acc,
+            batch_stride_dk,
+            batch_stride_dv,
+            batch_stride_dbias,
+            split_stride_dq_acc,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
     template <bool Cond = kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -616,6 +858,208 @@ struct FmhaBwdDQDKDVKernel
         return kargs;
     }
 
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = kIsGroupMode>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              const void* lse_ptr,
+              const void* do_ptr,
+              const void* d_ptr,
+              void* rand_val_ptr,
+              void* dk_ptr,
+              void* dv_ptr,
+              void* dbias_ptr,
+              void* dq_acc_ptr,
+              const void* seqstart_q_ptr,
+              const void* seqstart_k_ptr,
+              const void* seqlen_k_ptr,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_do,
+              ck_tile::index_t stride_dq_acc,
+              ck_tile::index_t stride_dk,
+              ck_tile::index_t stride_dv,
+              ck_tile::index_t stride_dbias,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_do,
+              ck_tile::index_t nhead_stride_lsed,
+              ck_tile::index_t nhead_stride_dq_acc,
+              ck_tile::index_t nhead_stride_dk,
+              ck_tile::index_t nhead_stride_dv,
+              ck_tile::index_t nhead_stride_dbias,
+              ck_tile::index_t split_stride_dq_acc,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            lse_ptr,
+            do_ptr,
+            d_ptr,
+            rand_val_ptr,
+            dk_ptr,
+            dv_ptr,
+            dbias_ptr,
+            dq_acc_ptr,
+            seqstart_q_ptr,
+            seqstart_k_ptr,
+            seqlen_k_ptr,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_do,
+            stride_dq_acc,
+            stride_dk,
+            stride_dv,
+            stride_dbias,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_do,
+            nhead_stride_lsed,
+            nhead_stride_dq_acc,
+            nhead_stride_dk,
+            nhead_stride_dv,
+            nhead_stride_dbias,
+            split_stride_dq_acc,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = kIsGroupMode>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              const void* lse_ptr,
+              const void* do_ptr,
+              const void* d_ptr,
+              void* rand_val_ptr,
+              void* dk_ptr,
+              void* dv_ptr,
+              void* dbias_ptr,
+              void* dq_acc_ptr,
+              const void* seqstart_q_ptr,
+              const void* seqstart_k_ptr,
+              const void* seqlen_k_ptr,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_do,
+              ck_tile::index_t stride_dq_acc,
+              ck_tile::index_t stride_dk,
+              ck_tile::index_t stride_dv,
+              ck_tile::index_t stride_dbias,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_do,
+              ck_tile::index_t nhead_stride_lsed,
+              ck_tile::index_t nhead_stride_dq_acc,
+              ck_tile::index_t nhead_stride_dk,
+              ck_tile::index_t nhead_stride_dv,
+              ck_tile::index_t nhead_stride_dbias,
+              ck_tile::index_t split_stride_dq_acc,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              const std::tuple<void*, void*>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            lse_ptr,
+            do_ptr,
+            d_ptr,
+            rand_val_ptr,
+            dk_ptr,
+            dv_ptr,
+            dbias_ptr,
+            dq_acc_ptr,
+            seqstart_q_ptr,
+            seqstart_k_ptr,
+            seqlen_k_ptr,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_do,
+            stride_dq_acc,
+            stride_dk,
+            stride_dv,
+            stride_dbias,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_do,
+            nhead_stride_lsed,
+            nhead_stride_dq_acc,
+            nhead_stride_dk,
+            nhead_stride_dv,
+            nhead_stride_dbias,
+            split_stride_dq_acc,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
     CK_TILE_HOST static constexpr auto
     GridSize(ck_tile::index_t batch_size_, ck_tile::index_t nhead_, ck_tile::index_t seqlen_k_)
     {
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index e0c145fde..4443a4503 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -399,6 +399,186 @@ struct FmhaFwdKernel
         return kargs;
     }
 
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = !kIsGroupMode>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              ck_tile::index_t seqlen_q,
+              ck_tile::index_t seqlen_k,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t batch_stride_q,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t batch_stride_bias,
+              ck_tile::index_t batch_stride_randval,
+              ck_tile::index_t batch_stride_lse,
+              ck_tile::index_t batch_stride_o,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+    {
+        MakeKargs(q_ptr,
+                  k_ptr,
+                  v_ptr,
+                  bias_ptr,
+                  rand_val_ptr,
+                  lse_ptr,
+                  o_ptr,
+                  seqlen_q,
+                  seqlen_k,
+                  hdim_q,
+                  hdim_v,
+                  num_head_q,
+                  nhead_ratio_qk,
+                  scale_s,
+                  scale_p,
+                  scale_o,
+                  stride_q,
+                  stride_k,
+                  stride_v,
+                  stride_bias,
+                  stride_randval,
+                  stride_o,
+                  nhead_stride_q,
+                  nhead_stride_k,
+                  nhead_stride_v,
+                  nhead_stride_bias,
+                  nhead_stride_randval,
+                  nhead_stride_lse,
+                  nhead_stride_o,
+                  batch_stride_q,
+                  batch_stride_k,
+                  batch_stride_v,
+                  batch_stride_bias,
+                  batch_stride_randval,
+                  batch_stride_lse,
+                  batch_stride_o,
+                  window_size_left,
+                  window_size_right,
+                  mask_type,
+                  p_drop,
+                  s_randval,
+                  std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = !kIsGroupMode>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              ck_tile::index_t seqlen_q,
+              ck_tile::index_t seqlen_k,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t batch_stride_q,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t batch_stride_bias,
+              ck_tile::index_t batch_stride_randval,
+              ck_tile::index_t batch_stride_lse,
+              ck_tile::index_t batch_stride_o,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              const std::tuple<void*, void*>& drop_seed_offset)
+    {
+        MakeKargs(q_ptr,
+                  k_ptr,
+                  v_ptr,
+                  bias_ptr,
+                  rand_val_ptr,
+                  lse_ptr,
+                  o_ptr,
+                  seqlen_q,
+                  seqlen_k,
+                  hdim_q,
+                  hdim_v,
+                  num_head_q,
+                  nhead_ratio_qk,
+                  scale_s,
+                  scale_p,
+                  scale_o,
+                  stride_q,
+                  stride_k,
+                  stride_v,
+                  stride_bias,
+                  stride_randval,
+                  stride_o,
+                  nhead_stride_q,
+                  nhead_stride_k,
+                  nhead_stride_v,
+                  nhead_stride_bias,
+                  nhead_stride_randval,
+                  nhead_stride_lse,
+                  nhead_stride_o,
+                  batch_stride_q,
+                  batch_stride_k,
+                  batch_stride_v,
+                  batch_stride_bias,
+                  batch_stride_randval,
+                  batch_stride_lse,
+                  batch_stride_o,
+                  window_size_left,
+                  window_size_right,
+                  mask_type,
+                  p_drop,
+                  s_randval,
+                  std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
     template <bool Cond = kIsGroupMode>
     __host__ static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -522,6 +702,164 @@ struct FmhaFwdKernel
         return kargs;
     }
 
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = kIsGroupMode>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              const void* seqstart_q_ptr,
+              const void* seqstart_k_ptr,
+              const void* seqlen_k_ptr,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            rand_val_ptr,
+            lse_ptr,
+            o_ptr,
+            seqstart_q_ptr,
+            seqstart_k_ptr,
+            seqlen_k_ptr,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale_s,
+            scale_p,
+            scale_o,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_o,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_lse,
+            nhead_stride_o,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            s_randval,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
+    // std::variant can't take in a list initializer, overload for backward compatibility
+    template <bool Cond = kIsGroupMode>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              const void* seqstart_q_ptr,
+              const void* seqstart_k_ptr,
+              const void* seqlen_k_ptr,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              const std::tuple<void*, void*>& drop_seed_offset)
+    {
+        return MakeKargs(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            rand_val_ptr,
+            lse_ptr,
+            o_ptr,
+            seqstart_q_ptr,
+            seqstart_k_ptr,
+            seqlen_k_ptr,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale_s,
+            scale_p,
+            scale_o,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_o,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_lse,
+            nhead_stride_o,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            s_randval,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+    }
+
     __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_,
                                             ck_tile::index_t nhead_,
                                             ck_tile::index_t seqlen_q_,
-- 
GitLab


From a420b3b34d2ad3e897aec824288182cf1e442dd6 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 22 Nov 2024 16:30:12 -0800
Subject: [PATCH 072/153] add Andriy to the code owners (#1687)

---
 .github/CODEOWNERS | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5340be274..d7a6b1778 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk
+library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-- 
GitLab


From 19d4b790399e479abd66d6555265fd7cd6389931 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 22 Nov 2024 17:16:08 -0800
Subject: [PATCH 073/153] add --squash flag when building dockers (#1686)

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index b79b2045b..2f790d8e5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -132,7 +132,7 @@ def buildDocker(install_prefix){
     checkout scm
     def image_name = getDockerImageName()
     echo "Building Docker for ${image_name}"
-    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
+    def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
     if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
         dockerArgs = dockerArgs + " --no-cache "
     }
-- 
GitLab


From ce2bdf42a9c7d78e60d16cfb00581c83a0bfc49c Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Mon, 25 Nov 2024 12:31:38 +0800
Subject: [PATCH 074/153] Change in fwd-splitkv kernel to support num_splits=1
 case (#1690)

* Change in fwd-splitkv kernel to support num_splits=1 case

* Update in codegen fwd-splitkv to make num_splits > 1 cases pass

* Specify instance traits in dispatch

* Fix link error for fp8 kernels

---------

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   | 45 +++++++++++--------
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   | 19 +++++---
 ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp |  3 +-
 .../ops/fmha/pipeline/tile_fmha_traits.hpp    |  2 +-
 4 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index d1da95156..1c40cf6f3 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -247,12 +247,22 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 }}
 """
 
-FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
+FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
-                using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
-
-                return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
+                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                if (t.has_lse) {{
+                    if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{
+                        return -1;
+                    }} else {{
+                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>;
+
+                        return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
+                    }}
+                }} else {{
+                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>;
+
+                    return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
+                }}
             }}
 """
 
@@ -614,27 +624,26 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         squant = 't' if dtype == 'fp8' else 'f'
         pipelines = []
         if dtype in ['fp16', 'bf16']:
-            for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
+            for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
                 # TODO: use async pipeline when compiler is more stable 
                 if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
                 # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
 
-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                 else:
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                     if receipt == 1:
-                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim
-                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
         elif dtype in ['fp8', 'bf8']:
-            # no need lse/paged-kv kernels
             for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', squant, 'f', mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask))
         else:
             assert False
         return pipelines
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 98a4329d7..3c4e02d08 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -35,6 +35,7 @@ struct FmhaFwdSplitKVKernel
     using LSEDataType  = ck_tile::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
     using SaccDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::SaccDataType>;
     using OaccDataType = remove_cvref_t<typename FmhaPipeline::OaccDataType>;
+    using ODataType    = remove_cvref_t<typename FmhaPipeline::ODataType>;
 
     using VLayout = ck_tile::remove_cvref_t<typename FmhaPipeline::VLayout>;
 
@@ -234,8 +235,10 @@ struct FmhaFwdSplitKVKernel
               const void* k_ptr,
               const void* v_ptr,
               const void* bias_ptr,
-              void* lse_acc_ptr,
-              void* o_acc_ptr,
+              void* lse_acc_ptr, /* workspace for lse accumulation when num_splits > 1, otherwise
+                                    final lse */
+              void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final
+                                  o */
               ck_tile::index_t batch,
               ck_tile::index_t seqlen_q,
               ck_tile::index_t seqlen_k, // only used if 'seqlen_k_ptr' is not specified
@@ -356,8 +359,10 @@ struct FmhaFwdSplitKVKernel
               const void* k_ptr,
               const void* v_ptr,
               const void* bias_ptr,
-              void* lse_acc_ptr,
-              void* o_acc_ptr,
+              void* lse_acc_ptr, /* workspace for lse accumulation when num_splits > 1, otherwise
+                                    final lse */
+              void* o_acc_ptr, /* workspace for o accumulation when num_splits > 1, otherwise final
+                                  o */
               ck_tile::index_t batch,
               const void* seqstart_q_ptr,
               const void* seqstart_k_ptr,
@@ -591,9 +596,9 @@ struct FmhaFwdSplitKVKernel
             static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
             batch_offset_v;
 
-        OaccDataType* o_acc_ptr = reinterpret_cast<OaccDataType*>(kargs.o_acc_ptr) +
-                                  static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o_acc +
-                                  batch_offset_o_acc + i_split * kargs.split_stride_o_acc;
+        ODataType* o_acc_ptr = reinterpret_cast<ODataType*>(kargs.o_acc_ptr) +
+                               static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o_acc +
+                               batch_offset_o_acc + i_split * kargs.split_stride_o_acc;
 
         // Q/K/V DRAM and DRAM window
         const auto q_dram = [&]() {
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 71c3bd171..4e8d8694d 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -25,6 +25,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
     using LSEDataType         = remove_cvref_t<typename Problem::LSEDataType>;
     using PDataType           = remove_cvref_t<typename Problem::PDataType>;
     using OaccDataType        = remove_cvref_t<typename Problem::OaccDataType>;
+    using ODataType           = remove_cvref_t<typename Problem::ODataType>;
     using FmhaMask            = remove_cvref_t<typename Problem::FmhaMask>;
 
     using BlockFmhaShape             = remove_cvref_t<typename Problem::BlockFmhaShape>;
@@ -48,7 +49,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
     static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
-    static constexpr bool kStoreLSE        = true; // always store LSE (acc)
+    static constexpr bool kStoreLSE        = Problem::kStoreLSE;
     static constexpr bool kIsPagedKV       = Problem::kIsPagedKV;
     static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
index e3187042d..d7bf8ea7e 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp
@@ -39,7 +39,7 @@ template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
           bool kPadHeadDimV_ /* paddding for hdim_v */,
           BlockAttentionBiasEnum BiasEnum_,
           bool kHasBiasGrad_,
-          bool kStoreLSE_,
+          bool kStoreLSE_, /* set to true if either num_splits > 1 or fwd training is running */
           bool kDoFp8StaticQuant_,
           bool kIsPagedKV_,
           bool kHasUnevenSplits_,
-- 
GitLab


From 36c7ce4e0eef86df186f8d796d7e177b8b13df92 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Mon, 25 Nov 2024 13:12:35 +0800
Subject: [PATCH 075/153] [CK_TILE]Moe update index (#1672)

* update MOCK_ID for moe-sorting

* add moe-smoothquant

* update a comment

* fix format

* hot fix

* update topk in overflow case

* update comments

* update bf16 cvt

---------

Co-authored-by: valarLip <340077269@qq.com>
---
 .../ck_tile/14_moe_smoothquant/CMakeLists.txt |  25 ++
 example/ck_tile/14_moe_smoothquant/README.md  |  15 +
 .../moe_smoothquant_bf16_n1024_instance.cpp   |  22 ++
 .../moe_smoothquant_bf16_n1536_instance.cpp   |  13 +
 .../moe_smoothquant_bf16_n2048_instance.cpp   |  14 +
 .../moe_smoothquant_bf16_n256_instance.cpp    |  12 +
 .../moe_smoothquant_bf16_n3072_instance.cpp   |  14 +
 .../moe_smoothquant_bf16_n4096_instance.cpp   |  14 +
 ...moe_smoothquant_bf16_n4096_tp_instance.cpp |  14 +
 .../moe_smoothquant_bf16_n512_instance.cpp    |  13 +
 ...moe_smoothquant_bf16_n64_n128_instance.cpp |  12 +
 .../moe_smoothquant_bf16_n768_instance.cpp    |  12 +
 .../moe_smoothquant_fp16_n1024_instance.cpp   |  22 ++
 .../moe_smoothquant_fp16_n1536_instance.cpp   |  13 +
 .../moe_smoothquant_fp16_n2048_instance.cpp   |  14 +
 .../moe_smoothquant_fp16_n256_instance.cpp    |  12 +
 .../moe_smoothquant_fp16_n3072_instance.cpp   |  14 +
 .../moe_smoothquant_fp16_n4096_instance.cpp   |  14 +
 ...moe_smoothquant_fp16_n4096_tp_instance.cpp |  14 +
 .../moe_smoothquant_fp16_n512_instance.cpp    |  13 +
 ...moe_smoothquant_fp16_n64_n128_instance.cpp |  12 +
 .../moe_smoothquant_fp16_n768_instance.cpp    |  12 +
 .../instances/moe_smoothquant_fwd_api.cpp     | 145 ++++++++++
 .../moe_smoothquant_instance_common.hpp       |  62 ++++
 .../14_moe_smoothquant/misc/moe-sm.png        | Bin 0 -> 206879 bytes
 .../14_moe_smoothquant/moe_smoothquant.cpp    | 264 ++++++++++++++++++
 .../14_moe_smoothquant/moe_smoothquant.hpp    | 114 ++++++++
 .../14_moe_smoothquant/script/perf_test.sh    |  37 +++
 .../14_moe_smoothquant/script/smoke_test.sh   |  30 ++
 example/ck_tile/CMakeLists.txt                |   1 +
 include/ck_tile/core/config.hpp               |   5 +
 include/ck_tile/core/numeric/bfloat16.hpp     |  36 +++
 .../host/reference/reference_moe_sorting.hpp  |  29 +-
 .../fused_moe/kernel/moe_sorting_kernel.hpp   |  83 +++++-
 include/ck_tile/ops/smoothquant.hpp           |   1 +
 .../kernel/moe_smoothquant_kernel.hpp         | 205 ++++++++++++++
 36 files changed, 1321 insertions(+), 11 deletions(-)
 create mode 100644 example/ck_tile/14_moe_smoothquant/CMakeLists.txt
 create mode 100644 example/ck_tile/14_moe_smoothquant/README.md
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/misc/moe-sm.png
 create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
 create mode 100644 example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
 create mode 100755 example/ck_tile/14_moe_smoothquant/script/perf_test.sh
 create mode 100755 example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
 create mode 100644 include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp

diff --git a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
new file mode 100644
index 000000000..12224a39a
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
@@ -0,0 +1,25 @@
+function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
+    message("adding ${TARGET_NAME}")
+    # not using add_example_executable() to add target, since we don't want this to have
+    # to be included in "make all/install/check"
+    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
+    target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+    foreach(source IN LISTS ARGN)
+        list(APPEND INSTANCE_SRCS ${source})
+    endforeach()
+
+    target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS})
+
+    set(COMPILE_OPTIONS)
+    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+    list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+    # list(APPEND COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+
+    target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS})
+endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
+
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+
+add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS})
+
diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md
new file mode 100644
index 000000000..599b4c348
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/README.md
@@ -0,0 +1,15 @@
+# moe-smoothquant
+
+This folder contains example for moe-smoothquant using ck_tile tile-programming implementation.
+![](misc/moe-sm.png)
+
+Unlike standard smoothquant op, the input scale is from different expert `[expert, hidden]`, we need reuse the `topk-id` from previous `topk-softmax` and select the corresponding `expert` from current topk, and expand the output/per-token-scale by `topk`
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_example_moe_smoothquant -j
+```
+This will result in an executable `build/bin/tile_example_moe_smoothquant`
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
new file mode 100644
index 000000000..f43626147
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm   tn  vn   pd   2p
+#if 0
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
+
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
+#endif
+
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 2,  128, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp
new file mode 100644
index 000000000..e380520fc
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm  tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 4,  64, 8, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 2, 128, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 3, 1, 256, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 6, 1, 256, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp
new file mode 100644
index 000000000..4d536cd61
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm tn   vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 1, 256, 8, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 256, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 256, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 8, 1, 256, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp
new file mode 100644
index 000000000..b38a4733a
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn tm  tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4, 64, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp
new file mode 100644
index 000000000..c5c170aef
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1,  128, 8, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1,  256, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 6, 1,  256, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 3, 1, 1024, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
new file mode 100644
index 000000000..0e48a1b69
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..4af42c6c8
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1,  256, 8,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1,  256, 4,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 1, 1024, 2,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t,  1, 4, 1, 1024, 1,  true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp
new file mode 100644
index 000000000..ea611a183
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 1, 4, 64, 8, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 2, 4, 64, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 4, 4, 64, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 8, 4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
new file mode 100644
index 000000000..a6209820e
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 1,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  4, 64, 2,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4, 64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp
new file mode 100644
index 000000000..f569dedf3
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  3, 4, 64, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  6, 4, 64, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 12, 4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
new file mode 100644
index 000000000..3793adb5c
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
@@ -0,0 +1,22 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm    tn  vn   pd    2p
+#if 0
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
+
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
+#endif
+
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 2,  128, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
new file mode 100644
index 000000000..4bf9cb1a4
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm   tn  vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 4,   64, 8,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 2,  128, 4,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 3, 1,  256, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 6, 1,  256, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
new file mode 100644
index 000000000..eb0d0fe10
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn tm  tn  vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 1,  256, 8, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 2, 1,  256, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 4, 1,  256, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 8, 1,  256, 1, true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp
new file mode 100644
index 000000000..36bc0de15
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4, 64, 4, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4, 64, 2, true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4, 64, 1, true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
new file mode 100644
index 000000000..fa6f53b2d
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn  vn  pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1,  128, 8,true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1,  256, 4,true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 6, 1,  256, 2,true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 3, 1, 1024, 1,true, false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
new file mode 100644
index 000000000..9b7462ab9
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn  vn    pd     2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true,  false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true,  false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true,  false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true,  false>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
new file mode 100644
index 000000000..8911bc229
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
@@ -0,0 +1,14 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                   rm rn tm  tn   vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1,  256, 8,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1,  256, 4,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 1, 1024, 2,  true, true>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t,  1, 4, 1, 1024, 1,  true, true>>(const S&, A);
+
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp
new file mode 100644
index 000000000..07783ac16
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp
@@ -0,0 +1,13 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm rn  tm  tn  vn   pd    2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4, 64, 8,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4, 64, 4,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4, 64, 2,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4, 64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
new file mode 100644
index 000000000..a5ab56a76
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn  tm  tn  vn  pd      2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 1,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  4,  64, 2,  true, false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 1,  true, false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp
new file mode 100644
index 000000000..4272cbafc
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp
@@ -0,0 +1,12 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "moe_smoothquant_instance_common.hpp"
+
+// clang-format off
+//                                                  rm  rn  tm  tn  vn  pd       2p
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  3,  4,  64, 4,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  6,  4,  64, 2,  true , false>>(const S&, A);
+template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 12,  4,  64, 1,  true , false>>(const S&, A);
+// clang-format on
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
new file mode 100644
index 000000000..a65d3fde6
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "moe_smoothquant.hpp"
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+using trait_ = moe_smoothquant_traits_<DataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kTwoPass_>;
+
+template <typename data_type>
+float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/,
+                               moe_smoothquant_args a,
+                               const ck_tile::stream_config& s)
+{
+    float r = -1;
+    // clang-format off
+    //                                         rm  rn  tm  tn  vn   pd    2p
+    if(a.hidden_size <= 64) {
+            r = moe_smoothquant_<trait_<data_type, 1,  1,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 128) {
+        if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type, 1,  1,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type, 1,  2,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 256) {
+        if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 1,  4,  64, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 4,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 512) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 1,  4,  64, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2,  4,  64, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 4,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 8,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 768) {
+        if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3,  4,  64, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 6,  4,  64, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1,12,  4,  64, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 1024) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 1, 2,  128, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 2,  128, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 2,  128, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 1536) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 4,   64, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 2,  128, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 6, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 2048) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 1, 1,  256, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 1,  256, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 8, 1,  256, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 3072) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 1,  128, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 1,  256, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 6, 1,  256, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 3, 1, 1024, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size <= 4096) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 1,  256, 8,  true, false>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1,  256, 4,  true, false>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 1, 1024, 2,  true, false>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1, 1024, 1,  true, false>>(s, a);
+    }
+    else if(a.hidden_size > 4096) {
+        if (a.hidden_size % 8 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 1,  256, 8,  true, true>>(s, a);
+        else if (a.hidden_size % 4 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1,  256, 4,  true, true>>(s, a);
+        else if (a.hidden_size % 2 == 0)
+            r = moe_smoothquant_<trait_<data_type,  1, 2, 1, 1024, 2,  true, true>>(s, a);
+        else
+            r = moe_smoothquant_<trait_<data_type,  1, 4, 1, 1024, 1,  true, true>>(s, a);
+    }
+    return r;
+    // clang-format on
+}
+
+float moe_smoothquant(moe_smoothquant_traits t,
+                      moe_smoothquant_args a,
+                      const ck_tile::stream_config& s)
+{
+    if(t.data_type.compare("fp16") == 0)
+    {
+        return moe_smoothquant_dispatch<ck_tile::fp16_t>(t, a, s);
+    }
+    else if(t.data_type.compare("bf16") == 0)
+    {
+        return moe_smoothquant_dispatch<ck_tile::bf16_t>(t, a, s);
+    }
+    else
+        throw std::runtime_error("Without supported instances!");
+}
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
new file mode 100644
index 000000000..88d300091
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "moe_smoothquant.hpp"
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = moe_smoothquant_args;
+
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+using trait_ = moe_smoothquant_traits_<DataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kTwoPass_>;
+
+template <typename Traits_>
+float moe_smoothquant_(const S& s, A a)
+{
+    using DataType = typename Traits_::DataType;
+
+    using PipelineProblem = ck_tile::SmoothquantPipelineProblem<
+        typename MoeSmoothquantTypeConfig<DataType>::XDataType,
+        typename MoeSmoothquantTypeConfig<DataType>::XScaleDataType,
+        typename MoeSmoothquantTypeConfig<DataType>::ComputeDataType,
+        typename MoeSmoothquantTypeConfig<DataType>::YScaleDataType,
+        typename MoeSmoothquantTypeConfig<DataType>::QYDataType,
+        typename Traits_::Shape,
+        Traits_::kPadN,
+        Traits_::kTwoPass>;
+
+    using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Kernel = ck_tile::MoeSmoothquant<Pipeline>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png b/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png
new file mode 100644
index 0000000000000000000000000000000000000000..5a40099ef3ce3860ed133e4b150ad4785108f129
GIT binary patch
literal 206879
zcmdSBWmjC$)-{+ABzW);EVx5(*WeDpHMmo_YjAfhoZv2n6&48Y?uEO%Ls#xIp8Iz9
z=s(c)ZJlvy>?3>0oNKNVuB0G|iu48P-Me?F(o$k7@7}@Dyn6=&`3M6&LX~5o_U_%=
zo3xmSs{7J$7TnJnsb&8}o4F>4O0JBK46R*ZC?9rc&*~1hE;@JS2Q;^)%~KV^h#$GZ
zD%vH&t1dp|YbL2%;J{TzLhVzZ*Qa~%;4yHJB7J)?ew|_12)e}o{=D(+KLP&Nc=zt!
z(ck^Uf6XpD*uMn-wcd4n{R;U%);Zk!3;6$<{>&f4xc+Os^A-_qd;ec^9T{d5^}p6T
zgHQjDC4c?@c*%$;cY4}g)9et%_1(n0tPSHdFr`7)7QTF3Wgg}Zwsab@D2t7jHAarP
zx6UFRot89)8F?<CbqH3a<3bFMmzT-FTo9c%<faijLSAfOdO{kBF$Bnnn&Ogz#1Py!
zEK+E!zm2DojUCuBd|yOXLc6OLkTQ2xG05;u(*e<vnku@#cri?u!;%x7J(B06M{l8b
zfi;Nzh0dUu4T!+>D=f6mkRNAQY}<mR9&WHE4K>HU?$hVk@tWF)FO0^g7WNoHL5V>C
zGK?oOjRJaxUr~z9z=01vtQT;+JiVB~;`WIN&EZaXC&5!Tu&QbwJ(^RKC;n=4D(4MP
zvj&TOy-;Bl92X<T++GqT`}vud$j3&kXs(#96oq3Hy8L@|r$j_JDooRoQdPKKHZPAl
zl*r32w)jaD%H*P=ku$h(<0_SaQe=J&`F?t&YBKMNAkisRSUPFxjo{f-CY6BTu6wwu
ziK$6AUSCp=t~Q2y6)X&;n>hb6+IkH<UP1-TDQ%OshzTV*6T?}-vLf9=Z)=QkG1$js
zUCG*A)&M3{NsNQ9EZGRo0g~G#9>HoqT*>sM!%M9qd>H3*Tc<`gXntpO*9;JX9)@sF
zf|LF2<I_uuEU5HLf9xZbk}1KO(~@w-Pww+ahnS6O{J_M#t6^6b|2AR`M>&t9A|B@M
zPAls?5}o6*U-ZKQ)_KZP2U%YA-X=jb^@k=0<=1ZwctvQQDgm7RHpJJ;-wX)en|U(}
z=4Q|xOH;}A>aFh|a`{>RbBYdn=q@slqp@&3r!?DqIDV_CNKSGqZr9MliruBpHi>G_
z3`=)c@eb6+6DVmbM*O3BOJV$cLlFf&%4oK&W1#&y2G`8xQ2VX*RY^`i6^lhmWMns>
z71A!(o}f}de<!);-1%vIHB(+N+(9t|oY8tqXKU<ybAl~?(yknwR2;IjT%_uTg&{hX
zEJPHWH-b+cseGLONJ*{6WafNx<X!o+Tbvdan=T?5d+VgDT>g79S((S`ZL78gDXT?N
zn@FH<+@U4>qg4b30Bw)hYg8;ZO`)SPG^6#4UOL7614*R9$gELN%SXN8#bfJZtHba)
z2T3u{fH7UzjM5W2@FHY*e@GWp_0TeZhqaxCZ}2<)(GIs*r<MGKqqSoyFI7<mdzb86
z6u6S6syH^>$c$G%Oz4FSk8LXwMQ2FCy{TQ2CH$iQZ;JWWSo;VYUVv#ZnW#X?uhN_E
zRn#`Vnc)}m2J0wCKcyiD>pPaB=(|C`M^O|wb;N7-gQN*`tI`(kOZkpOmX3jD<TVwl
za@l|b_2QtB{YS>}dW0M=H7T6De9@=WN)u|%R>z+DiVY2{Yje<`71H8cIBu2gYGrhl
z5MIvOrIItLT2Kb>@*;iL?9~2HVzuLPR%}1boG#9vx+8*C6nb-WXBW+svdi1OwB|S~
zD<}V_+`2{y4Z%T`2cqc75*jDG)Rb)HrtRY|z0p-8N(T|PD0@}NZ25)8@ahf=Q(cSu
zLkr&W(FK!>4`$5vL~JDyFT3zeMRQkN0)=s&z)F*h)JDOzvZTpmOBvhg>FVqv8kX#q
zkMALK)Ie}N72W&!)p$fLD<=$#B5%aSX{N&HD%Jgcf41sh20K7-m*pHd?OsIE+2xDS
zz7SDl$5U&UH>I)ppYdaZv3u>56UT#_ymP`R024Z;4J<luifaOXXO~~cC%<d0l$MYC
zn`9qnCoajb9teHtJ!acgoz!4^w51+O2#3<#6STf$ux!YXQElh3R&UWEb2|K=jlU+P
zJ|jeIOvxnH=7OT5JA!PbHlzw(DGIB|<4ULuq5`;=@L$fhYP1ixrr#rad>5NbTc#C-
z;i6Q=?$JgH`w*>nn8N<5>T{^(yX;h5F(Np*TUr7Z@|tx{`mE>0&jje26(b5E4v5+d
zFw|-=vi`8AK&&o-_0_h@fqTi@W4E-_m~mw(m6doyx~S}7Q3H;*Pc@N)7c_$nu28?g
z*g1)&5*|7v{A@_O33PyXGYWvpdib%%|FU>w;F{}Gmea7=8qst&l%YAvi%m?)g?C9Y
z_GDO2CLE=8m{v5+alNW$l}&)S3GT=gsn^pZzvV{FQZfCyU^l&!Y)|=NMpi;oR9b)S
z?0->w=#4tN<Jdh*5xj7HRgz$&-8yi~=@1{+r}zp_ZG2WlD_%ryVbM1)aeZ#SrRdJy
zbKRcd{*}sLhzl!#(VYCF)sCI{=6uR&>Lz-BMx;LzlQn7lLU~0g>=uB!+C9V6@G%@$
z4Wo*cwZxQG0}JndswN82O=VOSV}EmM%El2uqJx!szBv&k?e>E)m<pYG3E2zd?B@@>
zPNP0Qt>q`yroF;@(Xx+}Hx;2x)!}=WT_wi`zY-B)sO5C^*oPw!y)}&YcA5G>4^}E7
z1y~Njsu*;KGGG<dtf;r{&q;A>PmcLWdb&mBv>}G`vy6+yQ4Y2-1Inq(Pu(&qQXHV-
zaO>D9fUltN-o|o~sVXD?u(Dfc5;Tu`6j|dJ#*~xwsX7bq_3@P}?W9!AsPPi^>=~Mk
z=!mcc`q<#kFxat+>UI?M7Y4ynif`_cp<IEoH40#PxgdxE9b!KjhCVDIvtg*RPtT1S
z$@=WwUHL2gb*2=c;J)w;(UwY!oPex4nx0_7!1yu4Yowtn_DSqo!a<++=c17IdsY9Z
zCl6r*(b@HL0_oI59J1-o)4a0gge(gHIi7R+bl>H~iP7^cwi8A(^2)EaWysMBe{Q$&
zXPeE{r1nV<%qd-1W?8{9x~$(YhwWc!(8&RmXIumU;AVGsi{dG=DR=2HzqAepepxeK
znBF?dzZ64(P032;4Cu@GbqEHs2^WYsqnvIj*{uKdo8RtxmX=YR@wO?T__(HJuZ+W}
z)=TNWH2UC1_aBoVhSRMs^j|5ASE5ZuY$R0z{J9C%=N1<4ul6_6+Qz7jSqw=%nkOse
z^spEq2>DJER|tI>Cr2UmU|{nt`=q`PwVHDIIWEA{fZ)zfzmJ?>PGgQ2qkKoZO<pp0
zLFgzoO>@vW@-F3E^fK4DZ-%CE#W~Wga;tVx=86$AqS<LQ?8~~k*}jwB(YrjN|4q-^
z7M-1CaZ~Te-Pzla5whxIdz)r=!Bope28zQ4q*%{c>{&XSR%zqezuv6E1~2x}H0A-1
zP5cLO&*(x6zW*4J_^FAmqbwK8+83E_<_nEEho+>AkZjlP<r9I3@Mu}UQN4qCKC<Nn
zZlwEAz>a(+iK~Evu|-i!A5Dl$!*&5(w`%A@3;MJf_SEL+f!@;BL`F#c_bD@+xlLPI
zs__Qr@*gYr0H<ZLdW38*y?+tE`|&}&flYtXHfrEYAIKd@D;@RqVrx`yDb3<SkR*r1
zo*N!3y}D@MDT`m{-=Q12CLbjj<BlAzVH=Bm)|q+t$h$-amqH?pV37N?&W9EF`!?k}
zISwLP^MTIBFa>i`^YY;9miftw6c%4a(vNatpMp`2Cg0NXIUF=ZQ$cZ|K=i!Wt`{^5
zXT~xvB@XGI!D6{u65sI+A{AgsV}NJh0Aaa>(Ly-60J3NbB!c#zGzbN70b3PuUy<l#
z$F!AszEPdpk*Ow-e0rb!ArgNWE(Y11TWbOF&rX0L{+8qitT?ee3{ChSYz>p`!jV$M
zW6U<ppQC0~DoUw|5LgyZypQW`a9Z2`Q^rRii7U3nyHr=}<X3>C%#V4(3D$ahUI-~L
zJ4!`<x1KZL<d~nPgr2&hW|<a!LEqZR*yH)ac0x^qPUDT^6(1dL*6vJ+laPm<HW~;Y
zu8vrm9BzKcUU(~=f@_P!wHl2r@Hwdo3!7`Qn9JZhLNQ&??r?xvIgb>fJUbVT#laK4
zObCmReLO+Pp;@>647(GLG!ui(yW&uswdJ8^JsKAjI_aw(G-X&tdS6TWt?#d&Q`!?K
z%#jZ)W0I2+aOAHKW+$AbT4fAfo!KMS87PN=VgAkWWE#L|f8v3Yzio$TeO|)3)}-t|
z3ITI$cz^V}%WddcIXWKgGRW4BCnTN!NI!_3Z1(w~7-_*<s+1*g&THtAyw}SvViIn^
zvy^e#D!;+QgA7QUJ=dn1=ImJ->{*Yt3Q_HD%tlJf>8(3>WQg|R)PJr6_9<9s$cYWi
zi|uAa=8}X&Xw>2Xtuz!>EP7aHgVix^WTi}D5;3}+8u(K<rl$!_JM$|S{shgLm6n1n
zWeg2H&fSt0((g|gzKaVCs<W6O6ijgILnPKIIP~QQLDx4)-Q}e+xA{#Nb?kCUnZC=Y
z2ye@sW<Oa*W}-j6`B&q8*}%Fw4_Le(yHaW(`<de^<9)!tfaIY!-p@a-9{3OVVUtGW
zx|f%jueSNvisukxLCe$o_-|oudDHeV!ts6Z|KMQmM-__RC*{2<HNwCDd&;{D_*dlr
z;5cmY|7;n564+0G_0D(yK%ubm|G>z5k^k3e>qjEx&QURpHby_+-TQ8snQm?H)4olA
z_dyhH3wVS1584#RfP%$)Pl}VU!Il(-G(UOlci#Oq*yN6>9<TbZ|KI1HMGS+}1CYZN
z(c#{$!K(L1tWXpdB|XjIv!I1>og2IhcY9)+9Iu=C9LeDn4-dM%pX2!lyOD~+!kY@i
zMl_(0LOgkU)xqNre$Yqlwt#%30%Nf%=;fR<#>f4Q=r2Wtv&!hT;+A?X(R$X|`w0Cn
zO(q#RBsCE!gU~&k)L2>B%`3H1OYXAL3HBWcY8aQ&%!A=y0!j>IDN#R0R)(TUR_xKX
zMEHIdQt1Tmkxw!C-#{sJUz$epKLl(Uk;Z=grdmUX^Syqgxc_-$lEsD*eCC`D2VJHM
zTU?@zgFUe<R|=;1bK~!*erjPo|9pi1tibY@z(e26?)8~SGLg+G?tbuZ_vEU!8r@0N
ztTJGFotx>a$Fib<fX@+zjN+uT<z7PKV*@BDES=18?fG@%BJB5dg^(-Z=tv?PJ6oZ_
z(-+frXY7~CtDE+~zu*K)#i3ATS?DE*hePl5-T8;}4)=QO@AMnASogl9uz%Oa7f8Oh
zJy4uIl>c_1ccq*C@~g`Qd%9R{)$jyp`AP+WAwKgpxt?)(0`U@we6p%+w2Y2Od}_c$
zy0>3@#*-5JExaaCK5Hm*19ZeaAB+oIkhXR^Y8Krk(%~K(5|v;Rk}9}c)|)%{7M$@o
zWAFFjKU<OJAF*H%>-HB2{MwpSA=ar9b?GFS_`Yio2JigR&fE;4uuZ_y59I<kZhKZz
zvAA03D=wlv;rt;@Y(j5J^np1o4nc*Y@TGpSNsL|#Ih!KWcH+K7sj7AHwdH)39V#lE
z#*rjBPNH?JZrj~8WyqFqM>aVx2U~Mfm}2%vv<KrS<pBS)9To4z{SKAdI78o{e#~0`
z70<_|(f)C*%u3zXnd&SZ@4gi8f0T!R{y&n$TL6LqkQkO4)@hBQxvIQ^dq)M`+9`E=
zIEtcQ1Yh56MsIy?y{Hr*blxA*E0r2JJo`1!t`T#h(-MUsK1mT<Jb74Dv3&KFOd_Al
zDa25U_@e)ucy`#+>UELm<s?P(lyZ;ithkx<2H`h*;tAK=KKW-<k^TA3Xutr3FMMtH
zcO1%Au+6Y)A4F01Q?q8*5u4XbeKRbv!nL*9+dPx}4&9ijOOoA16&4;zP(d|Uh5Bn#
z#|hjs7ZH*smd=a0bcgiSSj-#w+Z&3p)kEm<07A+)m@LLG(RR;eS+V+UMHa1!!tJ%I
zI#vzWdtZLP`$zpKVMECx|LXpyO~wn5GXdfzs&}V}fwcVU-J>KXU|Yee^7c+1fkZGC
z<6JtBi*|M?2KAZx)~cY8##uwxW&OBx{nM||tLv-q36!m$rY$DUfITDE5S|5hei&ni
z3=z8CebYEf`(Kiuq~O*pYs9T2Fli`d^O6`+vG+R0zW3d}I@=Yo%GqPYfoek*xI<X`
zUW%X0VKYo8=#s+5_ws}+U?WL+wn#B%0BA3M*a4Hwl3rb=_tj~DjYR$sLmF)Wh?@(U
zNF+C*i*{GfRt*0;hI6KZBa8jD5DYnDw&1RTw3b5Kk9tQ*rsb5xYGJrnot#_LvSi6n
z;Q{Ka?J~r27~=$$<S!@JzC?WN6?t<?x_S9h{KVmx&*LKw59XR*C?}qn!;wPzze@%b
zeNG~1&|)fn#N1_#?7T2c;9~+f_IyUeLyAQD)Q>u>jD~o+hm$f#!vC@u=wy03#$v?s
zWlQFBFsa9WiQ`UeqcIIx()~%EBnc*49aSYHD;`*_VvQHGuUsv|Q>orhR6p*J{XfzE
zU#wm~dcTQ^&Ev~|AyGx6QTjBl2mFyM%+*!oi$lyGJ32;h&t%u>arwzwc5<?uG}0|c
ztVx2TDPcgA+<=mF@7l#pOQ1t&%EEbCY%YsgL*ED6mz@1l%@N*FDqMJdSGB@0xlaC>
z4mqy*)x~%g$aqUbx4RigJ#K{QgpEWT$+x7%QK8;pT1l6}DxJ4lhB`!a$$vXi8H!k%
zj_&~{g?Ca>N$1(xP^!`1;N;N$=nz==y+B`HtCQfx)|wW`RpBKib{*_xIGE0ZYb0G6
z0r{}R-T+EybI)>pG#hx?V@Yp;qtVKW5N!W&7#`&Poyy+lgPo`PY3&iZD48*v)0GRm
zEr9aM!O8B_ptLoX+Htcf76s~K1tma3<<!&Q4mmlsG7bu9UcCCs1D#gWld{`6T;2!?
zE1#2#dQ&e&>xQJ1qL1ysf=dyPDz&_k&3HYst@_6i-DoG6*f*B^V$nQ7&l7zc8D7lY
z2z@(&jQNWj`@Cy`hMP|HPqpEep;fD>k)FxLN{DbK9A_%+I$U94^143zgyPDkt6Az%
z2eICK)QYdmtpe54^>Na7fYlPWVO~1a!aEbs0TO~K$1AB!u4cWh9D-R@YuX0O=$O`v
z{l(~-l7+uBy1|&VEd|jP+FIjtpmH#FJ;yQIQYt|nMkGwEHSJPM+G^$>J*7uvx9`*S
zPH>_kl}LqNT-9e5fau!`>Fc9(AbN?CQj(L;9a5fuET>+3)ZU0lmODSBSja8lKgGiT
zYx~4Ja?_p87cK`;&|)H`L>+@(_k$-=t0yl@UwB(_>j|2y%<Vcvsc>HeYo>3fH4LbJ
zf913L0X~chIJv~embvAn^no-MY&al3Voo;E8grvl$E9*!mES*gjiq&zCQo=BhZy6b
z>$Vk49@tlzv-cfRVii{GZ*T(K?~FLrv27jqndjYQ*MYGH5CZI|>kR9q*+^b#d(`<}
z8wToI3!UN!^h-?`>RW^6md!4$aH;+6pVp)pWMJF`@88}u=u$8rtkkNM8?Re?_I}hk
zT@SZ>h`0v%mBV9sf|8Dk(?1~(Ub$s6toiZs8N4>4b3}}sAdASIPUjiFl!ZulPcW5$
zT1}tEGTW~PCN8ws4r9r=B!*wS+s#l9jm#-=5Z%ks^@L4nwy%_1@I$7pX^9@)l)!z7
zo*Y6>SBrl^;6k$a%HWaqpq`UrWvTf|5z-cJZNOeaq~vW~eWMs(7#i+U0Y1Uq-XIQt
z!ZuR0VLYZ656_aej$mZCB#X{`DW{hdLUKrSm~3XVXR2tSvL%Smk7Q%(G^qPh8WK4_
z_#MyI>@$4wRKqLLks4EZ0e-YPnEQNt5gZ$b>GXp4*)HdSKU+bv1-I$V$pp}}iDu!%
z%F^<<-n%Nor4+EBqvH-vX?7PPO#YCnl}V#isxwn@-G6?%zuaH73T%I<DGb&Wb_7Rp
zR?b=`)5&*oMoTzbRS6*r;!r@;D%_1Q|0iKXgUvew^iP1?fs3v1CNZ&7ZZG4lNm>41
zG-kFfc9*%(08_#5F2R7Q&U6x^dZJ}R*2)}nfT`3oZ#`3Y@_<Cys*H9tQK2DhTNyJt
zM1SwI^;%qeLGg#)5^w^2?^?Q|+H+awG!eV{@UPazyY1RD62Is+6TO&L#MEOsttgY(
z%hWp>bR4o`1ihXvB7?9ajCg0NSqcF<g$EB+(!^(KGLQE-0gH}2;*$b8Gq#!_-2hRK
z!`&WP^z12y-6M6!o%>f@%vuTA(KX*~PYYNTT_X7czjnRI<$>Rd#+aAbGCl?lQTZ*6
zk4VH-Fz=bkm`a69C7yPE8n;+p{*+C}ZaGa3-H^S@9!f2rSnD7j!{J}i%=Gz$fdxzp
zX;>RmX*{3TFVu@5e=64Ue2v*kJrff7MVA<MsLm8QXSRz28!A!8t{oj?rBl3gpKS~4
ztv``fvsax++z}S3IQa8(V`i!oPPK}5gan_;>b%DT1UX>uSFfmGg5h?Fl(T>gu2+0q
z#1PhwNZuR+;bgrCjzNZDB5PfL^+J3?RzLMeS!6_Bc3iT_lk<}85yJPeCJu7tv(blN
zCfc<ae{atd3jk{mz9LzWC)&Kc;HlnAtx}3QbK3#99p9$|9QclAw-A=Jhl0F~dM@Y#
z%&RfD?HCg|tF;z;7D<bR9rmZ%kAJ~LE6jt%UzGKLw5my}atgYjJu{2n!d+Yz-uO)k
zXzIAi7lWmX51;$;D@RBm?G@nZ=>B*`V*{b6D4(o^0GZ|nC&V|T!W}yK;T*bDN4hcZ
z7@p<Hyt_}vMlj~hBqHDNN%WNwp+yHQ6mLL_jyq<^qGMs@ziRxwx8$=s6!TN=+3+|#
z&V5JeA8G+t<gjvc(%KGmM~D3Bp?R;eLgkCIi+bXDbTDj5a9fKKX=2&i{HoTrN*e<4
zyO>-0E+YEgfDD)E1*915sGv>MH?a^j>je;qHqt7}PhCE|_z-tfXtqlRP#hkl{AP4J
zF(z=BLjRM6G$y#u>@K4Psno{73kcdmSkTt`c;CFy{sSQ5X13kS9YXF0tl|A+Ny^&P
zTi^+I|AZuUEbTlu)lN-Iu>kX$BuI2z$%t)ravluCoT|4s@yDI2UDf8eqVL}`maGmQ
zU2{zv-?Mg6Eloh{bhXnet}Bj<t>kkyembDH05DSI_jcZ1((dTyDv|d1-O^#~v`J{i
zQWiW)iNPkc?*BdB@Ut6Cq*o?Iu~@bL2@aw0GPet6T5uqwZWfux@@lkQ%VnL?ONeD$
zbTOQ7If4;fgA5rxEchy*al#13GixYngP}AIzvCrOAfKCsCJv>#wH7j>pwZwdpQ1q=
zf3~LZt?lai9z4fM<sdbSR^XdvY>a4!3WuL&Ebc3g@&?0#!(-Quj!~Wx749g((rV`~
zf*~DM@7o-(LXbr7#lvrv%EjyD8l-F{-i#&B!=;!X8u0{A51ylhD_RVna;F*-5}cZH
zcyS1Sy-M&kdsua5dTZxRT?4^XOteHe4{ZjXx3>Bq;Xp0q+BMEBLDwxoPhl%f&`J(D
zcu-9)K$pk0H%6lrfB(<m<X9O%IPmtSY~b%gDKMU`i7*#modH|!WI4|+xsLlT5|Pt+
zqp@axe&74^RMkZZ6DqcqUFzty-9623ZzE|bnKs=(l$CLjwt>#SO&ym|(B$7u%et9G
zGk+307=QkeviJJ7(KYE~hfB=6^@4}57iP6u2+Z&T5tw<Q^4BZTSW(3E=@wdRoQ1MW
ze#a69h32tqzMeKfPZ#gi(P9*@<Q1QXS1r9SZl!t$S!!0%cGrzRI9V^cA@A2(vEP>M
zeLRJRujNRF(5WCL=kBn8g?#z=>coki*;dzNt-$4CvvS;HZddYap4NSK-l3+X>0cWH
zaigc)PPSW<kudMq#-@ypt~b<Gq&@9kn{oKCrdVBg%iM-rdw4mEo3#eMrKh3rdpWJf
zN5TdqH`HsfbQnnU9kDfa@1F2??_K;RGwgrlaHj}oGWqp2?!y@(l0<rKQW)OBa-3Th
zIG~Yf`%P_tf&S}YHsj%Z)jm#6hJb_oPa`W_M_J<7y`xiAu!@$Sfg*u5vWiek>L)FZ
z9K4*2Nxv~qT=V$rm{ZJ&Dqpq24BT^%4d$)>ij{x{CCO=EF-qnB&Tl1UBT^66`;E&N
zb7$;mSErusyqHGZrW`Rv{d(k5&Z(`+>5vrqornmS<N_t(810TLWfK%}wv$7(9G)7V
zY*8-T)ebKVW@U$0)jwVbCHhv3(b<$r@HxFyC|Wfl6xf&U_e&0eGGGVM&#9TW!#O$}
zyhJ^$N7QIHH};~GKhx4}y4qZ5(-&bSoNe{JO!lbgIPQ8RvvpZ4ULOftR0`)9p9{Sb
zGj?{6M!SSKsfx_=Ktf!XB=d~Ig*&@li5~}%@4C%x1*F!4vXY&b8#Yo{EdxY3jHd%6
znKzpSVMdS0&UUE6dh8o+9EM#5qod7P8g-_5jlmp};&}xnGZp-W5|xuv#0qHKW2m~R
z5n}XcnP{ze64MO4-_6ht{WEzkT`qrffVX8hhMv1I+P|pXZOEi;ug1FbxUBlBw&7x4
zL<JC8-=`GeQ?c=NzCF9pRB8_7EY#~^bzWMX?0F++MaRu=$T_<=epzMC;E@f~_VPZW
zNB>{`V?JRUHTLgEG5*So*0oh|$OrbJ_u{eW))!`XYbeNGS!6hfr{Yw)v0+&=c>y6)
z(o()_60<T(+CIk;In+g5z<QU}I9J=UrwqjhUUKt4lh*A=8l@*&uVZwD63gy$xRUe-
z7qmW|F0{Uw`=yWj?dIHEj=1yV=Oa^(t_tQ}!@Q=lFOmKZmW#|PtUwK>7c-`9<_|-A
zuj@)J+I+uDbMP+|21Zb06;VqLukg8E0>im@_XpPcsut3dsU_!^O2*!20q@DZTJFDk
zA3nCD^epLUF+GDMXIbrd9?Bm0Vj)P8qP1mDo+Bhn-x)Cf7Mr<db?7o|Ls%NUojW7+
znY3SK3@|Y~&No++5yta(D35K`qfV^NEkT!E2jm)Ovc00)nP0}h&03>3#fW)1Jei<`
zaE+LwGZO%xtLe{$VQz}$5V@%pbla)fEqM24ZZa?$TBLT}or$9a8rL@<!9vs^#V<}5
zD`&f7gO#x_;<$(d^-bULj`<^wLP^F+=2n+Mu=7M2lJKiu{MJXLwZcEmEo(QLNI6+)
zk?crGH3*yS*EboH5Y|-20_sDy>GuWp0_REEzhZvOBs!UkBC#OF$V;NBA2_70UUk|V
zM82+-U$t@yb#{-B<M1{8dZ`9d=PPszg<=gQOYU;92fz3@{-39w0sa&pjZT7IAyo>W
zBiDSt@K`On%9bubWE!32{LB#uw$Dpo^RZO=tC!NNl@cjftcCYH)rQIV#D}*M*dE9A
zWcNcA$$Kl6=LC+$yZ_FDldVu=DRxm0)XVdm{uq0}Vz}Qf6>X(TCuo%ax{G?BMPX#<
z+R=75GD}yfUR;0|1qHT_TG27Agl^arjC|;b4^aVG+Rv)elBDD2J;s=Jg5jRYN!dy1
zg-Yeq^~e)DR2Y|t#EJ!zFH42u)wm#hcreZv9@7{71_Mz+zwzUE&J@-LJZK=<lo+`5
zF`rn2COB&<w7LYDdkP0y3U5dwIZ6*Sbu&E{h88<rjL**5LK(&RE0*#4S?ov-XrBh@
zp34Of>J;b6(5UZ2uBH6u9#t5YTAfegLaWy+^86oGaJ1i*W6ky9&)E|*g>CljUGSrC
z^=A?LMB;Wm3wLV9I|;%Rjy9+JN(2eY!lukl<D0HV#`5+H7Dn+r1oi8+&~1EaTA9+q
zQe3CTMXfM&`Qy=c0|ayF&9?%i_|XLZ%OlReud<g(WnU8K_Sam4`ptKSU_~VfN<}Me
z)ke)<CSu;m)?2UKQnT8K+wCs#MjKTMj_ORBWMuU+dE&nLth*Zcahtj+ZSIzyRjXLK
zP?LxyHPludJvof;3#GXuKWt~6r?PRLNa{xz-`7q0(52D_D_mB1KOC&bc^6J~wz-Wu
z>hE6<42+VrI9p372u?LJMO#PjFR0RNBdq$cy7cI$YnlK|YfV&1@Fcwgl}MBI>h_K|
zhVL{{N-L&|)<TAoGkU?X<T=zHZLC$4gf<Lsg0no2vGvxaCc?`SQlAh;*K&o`ALip{
zCmSTgjRNW=EL>Iw%=13^UC^Bj^_Zug3Q3QR{IP!_m}FmqpB;LEbSC;M%JjC0gB3{H
zAK%Tw?(k=F$r1mma;S<<&C@s)@*?Ag&0^yif;#wIG{wUTp#?09$Ia;z#$-v=zUCUh
z>aDTG@uZtbl3B<*gscEFjqNvEtfV6VtEa`X&vRUxC?`AI3g52Fsi}_?193|oK3{%&
zI2mC#nto^dr~4l6&|ok)Z<>oZE+Uu{H(|&45y~2b8qMZiJuyQ>lT|1c=i$gb!(qKu
zvB}<ha8P)UHIlRUkWsJk7zY*lQgEs)Zq1Xt)^3F<eWkH9uQzcfX~N_bx$E8o2z39{
zbvu1=>X+&zd+~N`mMDf6>Fa@E7D?$FM8G?gl0Zi<A^tBO9Z~~LaS?BXljTu5(ssTs
z?;rL(j<;RBCe~1TPBR3iW+(Lv=j?bK9?n5vtICuvcg_?{KIBqRvK+&@P`LEWfiqpS
z-QI|S3^-}#u%xoM3O2i)>8rGAw*~WW(W)n2-qT3eEFRwz;?cU-4#-jQ_1IXkiIL;U
z#<15fyIfd7I|ZF$bTy}6&piLpj?qM5vs^S{CVJ2vDw{(gq10vVX8Y+%R!b;zu}zlU
zeHbzR9#Mz)v2T#Vn*Vf@$W45uHR4UB36KfG=lix;&1f~s<4#*IY|4l;dbsr4<)&^y
zsFy&%Nnwq^nUdeLXCg*TAQQ5<GW>gFOxBfgf0{KHDYfu?-ZJqn6E#|%){GLhM0?jv
zOxJ-RG7!s?2#hCa_H?jX&^mLo^T>s^YJY9%eOOBCbHY%#*6NRppyqCsr66^QS$Da_
zys}vGYktbNvIW1`;n~(}OS1OFaUi3fi-eKr3f1k<dEg4!kXij25QsJk^>FbD!aLtD
zl`&NuF1GbUg>U;04uNpK-!5=3q$%<vVHD#i#vx0N%hwy2@%~=LcrgTaq5PRYzrnu{
zWMy*NLp+_;X;iJY7hLF`a&xHzzwg@d{rbDyr_yTUY%srhyI&@TSCEn)%D9A`niONy
zP^s~9_uiJxX;$_wv(^3$@v_ZfRolg89iZfFQe3Ll(gKInz*rl0ro9r7Z(-ukeeuSG
zr3>a((3BFx=Yf+Np2S2h9Z;bg)u71{>x2+Evt6oT%0##M?B*({4Y_}mI!+a85{il-
zw{Nt~`Z5jQFsh;C$z0f1xq|UPmtK_q-j)E(a$c%*!`E4Md@!eh_Zz8zPU+kT;Y^F|
zYMEoQ>*-w2pS*Dgm}=!YYe>zhw~yba(y+G6^M@7NP}DPcpkS`k)}bjezn_4^EEAFZ
z9=$5q{7t*B7xBf`9p|>;<_Q|oMJcrkEB2lZ^?G4Q9bXa4T~}hm#OL+<C0-M~Kndj#
z_SiFiFQ4h!NdAP9{exC-8=;;Lh$@T00mZSpEs<D3Io0!1%0BM=$15i_4}3BC^0|R7
z&VR(RU*)%?CXF$tB^g)Mb5#mruR`xibOsS9As)MZtvz`6-ctp?UlJx?lxPiCf=>Q6
zCUVUY3ovgxnRa;U`teyG6h3WqRxVS2ic7(CSW~|N&Xmw#pU2ncbP_%8onr0V2)it$
zSoTyh{&r0E?Cbl0PHsFYa=e&77I!=Pt%t8KXnVhq56Vnxg`CXRK}yEtCn2O4#GO`W
zl@ogmF`x=H)3Nm!kBITbal#M(<d-xV(5_q8{ML`ZK*Mxp9R<Q>q7Fx4U_@0fK)fRZ
z+uc?5Xvqo8C27qCyGqn~<-B!S7|L0FV8N*4ef$UtTd4o>q;ciQ8Fshy=R{kTmd#^+
zv5-2D`BkvSP`vt%3okthVl;L;8@SX^x&dU=cvRF6gf-UcoP$ZwByipnK2NLy(VyZk
z6IqG?;rQ5)x7~1~nm<40?+twbhQ6(gRmb@ruMyz4ToySsa#rekI^?7P@J?GUxbGk6
zGm6SRc+qVSLyvU6USXFN7Ta-F?)PqOfUAXy@GERlbJS;tzI0qFUYMoiv?iA;DW0vw
zjX#C<0b1>uWvd96i(<mY<?}&Dj|H-Z%&9ll#^!@sBD&ghE8kO7?*P`6IO0@h$E_FK
zwDkRdWagTDXD=U>>jvm2G~!lK?GDZNhwV?5w`-p;D8|~m4#pzqUA$LP@2s7!A=53P
z)j>=|^_wQ=i9zfL$kLq>97DW(E8!6azP$Hlf6F&H>&l1~D$5@ls<mD}ARj_o7!>nT
znd+x}PKR^fw{vEPJfB%A5LnNf?*4EACPaOizQ*c#`I{(GVT8jz7ZyA|Hm8dD5(6_K
zZgW-*AA8dm>TQc9W+ydhk#zj7H#?A<;`j^5L5?f_bdb~iU#lLseUIdLan<Tq7n(d(
zB!gSLW!r-;TN<bAZ3JyjTiV90oEjyxqCy`ZNIXonYU7%`-&*L=bMoqIltGR^o)EU_
zzsb_NK8TFm&w`b#!Zo?u0sXCNLllKOF<P|(l=P{lHPCj=%gTUa;f60zcAR#f%L}9}
z86NzMD6q^i8_O=y_r$<t*gYiNK&bUnrA59f%bnZ4;Pl#e*t5r|q$Jqla%it;t`pK*
zcR6FMaqNzfBg!VH#Z}BW9YRW=I5a>ECi_NCjkoBY+6OX2u_{${0K1-VzRZS>PwG|^
zxH`|L=w-~QmML?(uNGHJjgE;23)RRM#AhAt3~3RK(x@IjEXS;R^xgbCgEY*Y=jhZt
zx@9bD$Gz;@d0-O74*hmY)ZS|4m_MvJ${RNa@Pyr>5rKq!4<>6esKt$G@h=a6x1sIc
z(YI1mILxo0)Fx}h_ap`_LKccjHU5fAnnUWb1nXON!1(g-UQK7v3Yi5;uc;Sw2SHA3
z7=niH9B3zTRD1fZuNnlc7^yam@kqp&7aZ6XtHkijwK}e$tau?=)zf3w!_lOusu8l^
z!ECO!mJj09<}@o+)lzc2PHuto^pl&F8aP5-l76OS!nKd0C@(Qju^c|plXEQ5Nl)Q?
ze4h5>|GlNDh;f!SBiA`-UF>YZUF##zvs4NmC+WcR-M9-l?X{UhSj}?VcRYVKB#v>l
zz;SG&+fu)-&@dIL_AmJzHHYk4L9!yqQ~dMxkL;Bf`;mHa-Q=i4gdm{5EWlP<jTy|$
z-OxWJaEBXvO@9)hUbr_zFb>&1+IIgbs(8)9%F_PQ-PM6wp6&JAJwELr{WiDOY9W*{
zllcqRsfn|6yZtvaHDsUE>%PxeBfF22>Fm$?{pen8@~W)e1Dt@<Rie`+a164!V^Kb%
zk{#0+zW8uwKqY6T0T6w9gr;b(p{jZyli-3-l}`Q88S(TY0y7g224SmBL-Fgo+4R=e
zxq<<`&YL5(PNYZfw}G8#ywi3~HWp3}@Kc>nRd0n)7SYyJ-FKq|XntLt8&54d=qN`Y
zoL$=b@tAv!9HYDFeBpc7np|0!f09b_=moYT8Eof!+|Md5Xw0tA7$dJ2wxq=rjFNS9
ztKne-892-A7RjQI@kcl}f+~K@RaR9?4x<wzypo4=O*LkEKP0($=~ERQI;-a3-XMUi
z{~Ad^${8>cWN$NhoOjT2DD)y0UwP^jSqNmE!K#)G$gafTS8kZW>|{CJ@_h8Vx|z&2
zoxJz~uJ~v-%ggBrS9wI@KYJHs2GsJ}?erRXm~m~6#ADE^b9YEh(Ri~!pZb8NZ%nk+
zMptM&A%+*Lz^_;Iva679_%>5w(uRQ%=9!-JraY-y^@P`poPifj{M?nc_NUl*!mWC2
zfqp@jsK@uatI8jt9^8Cicr^@=5p!xEK3xh(>I^^hc)D}*IAY}`wnd_M7u&ATV|+ZZ
zW#E+iVq;7CUFLIQ9IxT%X(kL0(&pB#YGD+?(%&l$#CQj3hLD1Mpg^-zJ6(sAt~iY2
zPCz{Z^D1czUf$J_`gvnR91cy=A46pskalswk=jibS3~)0^cI8kCjqkxg~1_-#hRuD
zdwM&Kv0+=y(2&|ep2Yc61-W6Bd@`cBO66!InsEzy569lPU?8j0l?Okh`LuFrrZS}N
z&nSAQvrfwE$>yy(>)R#7F`DYx)LYv%+kbq^b2pWt(MHYyIs7xu%>gjh48UhXiaU*R
zwEa`3dyJQojgOWsb-YJ81)Fa4p16{a^BNE7JX(?oAm|~^fI0BDTOCx!v!L0gFxvLK
zPAd~=L-@ki?338>{>Gi+OiQji7mh#-rb9Ifr86`ZT%U(5JaCn^Hg~oY!dT<JerzDS
zv`n_BZrxCzZWAjl^&M<%N_yeQ6!S_j)e#=^Xw%as@^1hlg6yAa<mMjxqJ!wB>^3cV
zgMd2KiP28ei)#D99~6qNu&>;ixtCGjgud$G>2o^Kv{uPw+b&1O%UAg}oHWwtcYJ;4
z%45%GvsYr5<yZDo>Qi={=ZPhQ=eoq$fem1`)+`ur5dXbX6Tx3Em36J8*Vk?XqDn?3
zqignP^~IxpcY=<N&6&}Ir$Bx}8|Kk3s&Y9+78fz@In8esTRGq{1%ipT2(Cx>H*xRb
zhh^%9N}-)S_|g_QTfF03{Nt$+5l+^sUHs<TiP_>>ja?KMX0zzOUaJqwq>+-0d^r5Z
zYr!so%XgNVCoiMH{l%)ETb{c!|H#uPqd`eZ^PTtabFLj67dTo8dTK_^0JKNCHV^)s
zALfWvt(-&O=(h6+V)FmG;k`!C9g0}T4sa}e*z&TVEgK1=49u8In=56JaoRqezTi{H
zf!2f-^;?r*MT9B={wmw{H9n{E7zi>6=wbWO@MKx_ZuXGot@H?n&SU&+cC!HKUrtWw
zmz5ho6tKo=y({Ht6_UKEFj&;EmJ4MtHF@I!W{dR~B<%6>#~wrmA+0jT1I)K~<+Yl)
zj(4oL^%{Iht4N&XKe-<*&Gz3|L}y07T8~^oa)AV~@Rm0ulcGK{zf3<{(=R4LP7XD0
zX2;J=39*&(W*8Pct`7^$r8a0XCY7<dwt?g3EORPk>72R7iH99hRDq|0>9r0*f3`?3
zmLH$&YyD#{DRC0%?{el9b7II(41_GheB*eACl;oBs1+dYVcw8xXpy_Isw`|r>>}uK
zdb@@5q_{_`3VUIOr$jtK64R*uVVT{^U=7{W|0sH#9BI2uNU4uF1o_^(DjZ0FI)($I
ztutCYbNolI!jx6}t=md4_HHj53xOGuBK?;pUAz2bvhB3Nw^G!g=r*Sw-ZpF-<A@5K
zwjc*pAy-37we1Td7qQ7zgP~n-q%ylLBl#N&f78jeTDodbZJI=AE|fPaG*Z^)oIi(*
zRB!&m$Z|w(j=6SBZe7a+(Q5IK1^5?Ps=DO`+a9TQK5Vz>mWlv%`0Lzm>at6|5qE|W
zfxW3OU7d|eKr|G+O3sC>h1hg1o_2!blYXyDV0|1CXU-GER*?4^e!_MOPLlyoRcAC@
zliR)tx8n`uSC_HWfPf6P4v(*Yu<Bp??{kAfZaJ$ix9}>SsxS{oYE4uizoTjW?>>!a
zd;fPWnRkRcf@E76m#fxhhlfIAN(g8^r`JWh&U$U(RGuzHV$)|`UGw~T1JJqb`RZC2
z)5>rBQ%>6z7|eHc-;AK|)9j$x?;Bl4fJxZ?;(T!Mvno9%zuc>}0)Iu2Q=q&z(-t0D
zv0sgpdSP(kp)i|K_FUIzAdD>QkEmR#w>24{;e)r5fbBPhi-AN8rvFK-osnE+uzrj7
z*JHy$aYB-ZdxcvT9DZO4NxcjXZUIGz``j+6ymgF?)^N??{pxxAvO*>COjUc%*)2V3
z`8b4lrn&Ez@%-yC(M0<x5CY5Nm<$yqRK@gGn36@95j=O)npvjK1}-!5;d}2`jxaMm
z7a#NDHF`Jb7G4q7THNiQ&hpE=vfJwMm`nNrH0;W(1U-a!H>AA}AJz~&xtGv%off`A
zXcysLf6PBaN+)xvI3`1wg2nIjeb%>=1UcGt(N>Xw9_N?KEReifX<J<QvHcYb`{@j=
zhX{#!F0RUT(C@W=7sdP9zrc^-VOfn1)v63hVPf_0F0>~q&o8c0CO{sz2<dWHQ#|Y%
zhRG`Zn++lYd<iP!$AiVGx+^9DcTYYBIv)B^LreN}Xp8*&GViRnY|BTu--bmja%=TO
zdWWRS4tr<(zUgt=xhe^;THTOV8`?dn?4gBwNrm0|6*r23DE)9Jt{B=3p!{uG+VCX_
zGTGO+3=%_rliA_db!HD-aa=B?FSZjD6FfGkx^5=6y{F(}&XAK+SaaX^6=M&;h_+Tw
zo2fSqNVsy1R_AjX-oUHIngp#CDpz(?9$C9I2sV{?4}>k;=RO%4PF{GxL9_bzAw9@&
z<&I)<{7&8Ro0IGzLC~3CG_N1?v6>?l9c*6QnO44g;Ni~8u$MHfXol(Q5*$CDf$!sP
z58np!w9}>3ug^0b(#x&Min@Lc-fx!SQkvb+Ch+=LuT4eVP=1|&AD8pV##a>M)d)TY
zlGWJWmUQw-4CS(hl99kueud%D`nWK9&z8pOy(ag&!IFCuRhbXV9EbB|RVwoMXa>3}
zOI25YFdV}7!G(G<P~M<NhAgATs!6!39ux<aX<+f{?|&m*>%hC%ya&bMAr4BV$u8;4
zSE|l+{<>SK46VD|Nl`h7dQa4@D}w}gpA%5bB!nUvtU5h!eVeNw!$>D>Hx(`4&#c#s
z7javwpL_{zOZHJegscqVSZ#0Lk?EW5;10|l8t`!e)WE!OjQ_?9-)#RQO1hxkB>Do>
zb=fRS?7ZQSOAlGVBjmN5Uxe0ps#o^<UZ)D{pwK(qm&aSMAbIN7(Pf^pqsZ;3xhuB{
z!i8N!xyvaiGDo)sw3d5F40`~y+wh3ndCvxBOVqZqj+^n)@<L~d>KhkyIxH_@q(^O7
z+`RaN){e%Fj#BACzX6k{Rx!k3d<;v`Xake9KKE^0lF;<9)s(Mq0A!PsRYo<lA)Kwd
zjWgpwEhp}yXw><{4faY5h<<U^_u_u)qP+g7KexEi9VG%-yP36mZA3Plgb|GW8;<j$
za|h5he$5+3uoLpA^R~G&ebPOyEOxOvpH6Z)3UwJ77|p4DyNqu6j?J63_Ll=fc`Tb;
zGug!8=(%NM&C{VLz=wRGb0IOj%)F3Zsbu?5m<Up3cTt?EpDz_*L(rBHlWH?3YI{Cr
zzVIbXopJp$hMz?c=uYrW3S)|qaP{e1m3i|Z!Pnu0l4}PT%J1QyA4yK8f_yCmUUyf2
zuDoeR0dt~R8vNf-$J!Bp0qIUg&mB^pgfdM(k4${krJ?EJw_9TEiL!1ceb*#Vg}D0F
zkwJTWzSH!bZT4Jj5@*-+-jB7BQ58R1G}9nqndmFcD*iOLA%DKNmYhfwxO=Mp3c;ph
zFh05aW8Q&oHVWIc1GvxjC`*qVf(_GTS>>Xe<3G&iR=FWHsoc_z8l&O>7rTYhH!-u8
zYV4ix^V<uU&_0u*MuRq;Jk@TTRw;J}8bV5c3`!PwU2T0_vfiwwpFDq?ICUES*5gG2
zu|c0VFnS~L9Bs#knLeX&Yt!^;H+PrVR3LbXLbrH-)kO03wKFsF@hxH7w^vva_Rac;
zeaZAo-#>BFbN~M+7K_8&pn$@C&WsBIXq@=8FI2X~ycACS!aVk6gxZ6Ec?DQ=H4>6X
z<TbbT+jI5+=!9;d#n$XIzl75gzwBpk+y+#;C2qY~fDpu@D<03b)!jV-pPoLSL2n{i
zl2MPhWVN}`WwOlpF8jv7hbFh|uERk<sc_Qm=1*Eu1<>!=@$yeS1~3Nj1ae|Ce&1>S
z$^?=6@L0)<of_}sG8Wggx$3nOuy9?m-qKUIJru&m8EvN~DO8IuIkAQJVV+ojs?)S-
zDY;1xanb&DY@|vazCwV%B!2B#e-za+o4RPHd`5Rq2iJ+-?8t%=IVS9=!)PsKBZ~?y
zTWf+oSB)d5ubz{mT9-<|03#>f$KJd8enj_e(7MRY>RGl+G2|<DuNb9yE3c+j7~F?>
zLJxY#(fDe5?VG4HAN^_NBxhZE7Vsg}L1cK%^1>GzIAgc5_Sb7}*9=FN`E>s9f6BBE
z_XaAKue*1tKa7j)5&&4(Uk;72fWwX9ADhD~zUaN;yPdqz%1g3=lz$*}kG%K93hE7l
zxs<Tju3$u%<S-(|I7t`Kro-(Zf4&d3J$+FFG<9dhsPA8*)9(2}rPu$fKkpzWp@$4B
zz$>od{a)c3Kfh4Xy#CuYy)QDp3fc8gX${tFHq`95dTjb-`>_>#y&a8IMOlZ96k=A8
zz{>Zc>U*;8$}Pe#D`or5czSn3sOgTrc)|j{LeQK3lEV@3##(h|6C?ZZv@74U<aS;y
zJUue<0S`*LUC|kaRwz<@Y8xAHs0Hz}`E>0f-$~dAI@q4ZX$zk2>qreS3wStO#bVf@
zK~c89&@Z<>cR+_bVC9x=Ts_(iw5NUVv|Xm%S!u01=`^~2YuN(FxNCcL<p|+EHt%tx
zM#vU$<a%C6f|PVM>7i+PLCICh0xtN~tlG2gnX%}&e1cV@D$zwNh=O(cy^DnFd7B|8
z>)_1l9zg8xfkmrZpj9XQ^KT_fFvD8l;dcEr{O69k^@Z8hdE%Tb$oS~So8G9?_7GKl
z<1?m`_0O%tGn?)xcXN7G)FOa|_Mo+8Cnt%Tl2Eq$2cdUlf-Ohh=S$>!(!VrX<Vq>Y
zz20sFo8Mri+Lm*|eNzhiG%M00*-JqEO=B3)NUy{#zAND(paogij8YY&Q=RFlf~bkK
z1&|H&N4S|7Efq9x6T6hR=_RBkyHOvyxCTtkiB6ULO3e!XTjM?2vK$H!65#Ub3Xl6(
zD6@BJ@uO3hHoF7eq)OC>^te|MQpI$e`FW34jrvqFsR3lCGP#w-^?8SOMkJ5fmu_p*
z2+-qWRtr>iz(B-TH!>8SWo5BkOn-<dN|wp-BNU&MD?bzr@lC?_xcuIflOJn`#`&MO
zlXUz{RnN&)u42~WD|1`nE#8g~6r*5a&E*U<)L~v5S1uWsJJA@L<m70|ed*5fkg=t_
z+a$71=_E<4ff9;ooGz)?I~g%t2Cbck*Q3Ss-1B<Q$4g1J#div)v#Vm}uNXi@n~$y2
z55*DSKIfe)_x*Ksf~NfDO4EVOc!kEy^EG$eWjn*4i4ocRxzr}RWQk{5org=LC-@-;
z)p%#*SYDGT(~hsV75w6A=+v2A!TkA|))!DN2g*FYaDDe%^Sju|2oD(*h?2J9@O!GL
zk`Lv$lOzE%En;QIJ$mg??dV3RO>`C>e}{^vI=DU(YLv{>Hxaiwo+0+twQF+`PPbDI
zKMz;Iien&3S0KqtGfYe|sIThtSDRe`vdE#YHP;?G&%7p+%q4g;q%bh7`s0q*Q>_u<
z0zTrO2a=3I!S3A}68NEMaQ5BR@4dlLTkhSn6W-GAK0OH<2Ru+04egLI`aA~?dTAJo
z7Kn7JeihwZH9DPxQq85H*79*9OQO}W&gU@PSC#lr)17~;$4C1^sjvSy*qz=%NwU!X
z;>!KCzWLdUmuTWnJVDbt50m4NfJYA*7pQVIwE?ukQ#QQJZcZ#=2gXcD0WxpLGnS>`
zLq-g!FjOAVKGLCrD*VciVR~~>p>q^hM~|&X7F4u+Zz6KPa#F7XF4~zwIy8MwZ%fx6
z@kIq>rHEQ~DC$m`Ky>-Bclv$IJj8^F89Zd`#R2lqf;G9&NtUxu*ZT>H%=0Qz{94H2
znp7Asp$$L+l^74{Z}@}lU!%JmFr>YY2>*RAA^3;MUp~d}LL;h3A9aUoe9?^3{hy)J
zDgLp$!{K*1{nZHiUf3N*$4!u4l1~&6c{5w$>9VohA`-fCgQT6}K(0*i)6WYjSqcKr
zxy%-htM#2!>%L`eDWRv`e;0!F0&BGrV3y{#*YNajt^|iZ`?L4Iksc{!6DaxU`>rF?
zot$T6AXUTuW#YA`D>N}7(ZKByo&wVOdc#`_7>*8G_;VDK;>M`H_<zy$)^SyKUAM5U
zgrszrprnMPbXkCcfOH5*36h%@P&!1q1*An<1U9`X>28os3MgzqX*d(TpXa^b@0`!S
zP=BzmYpuEF9CM5@7fta7#qQ<%sG22f%NX6>(l>^-4NDi{r_>6*MQD`F-`)G$b7B3N
z-*Fd)XrdOzUO1G|NL1&VI;@VbJTkOG7gzV7`|a5hZkXOE(C^Oh^>UexEq-|VEz=Nx
zpl<~I(Kle)gYEQciu#>P|IX<2(~=RVgCMnnf9KTxvMun9LDZ+bOU<d-^gpu)r!zA1
zY+C5#+%zT}^0+mM-QvDmnk4MfElVD&A+Yvm=+6F`&VYqtROlnJ5pK@PHGjTw$<vCQ
zbStXi{*zNahx`wp^XC)f?t^(=q7g<ymF)+YvAXNx5atE%W<*5cM@i0Jqrh}OeiZu-
zM^;SwAzxQAM}b^#6E>TxdlW*#^H7MF_g(A3py^d=g_Bv_Es9SLF(w=w9F&4DIIdyW
zy=eFFq!mLD(9kr^d00H$bO|2m?=QI<dEH(7M4PPV<P@2#6Vzq&;rBmT4ma)Ye?Lt7
zug|TGVzY7GV7b0%cymQhnNC8X{WRaY^V6|Rr=+A57Z=ZnbjV-9!pc=I7_aDV&PFdU
z$EBnk7uXxVZfk4fHL1-|A~=0bn#VhcPTvP4O-5bGnc3&=d91FTF4Jj@9>|xZA7jYb
zZ^dV)(>ikfkEuZcCa5wUvz2~*woZJiLNA!`ivCKmX25YKn^wUvc})xpQC}F1*`=!q
ziBS&S<680qW${9O;cQ{|GF>DxCF{NR`9I5tPCNgc#O&;x44drbeiIV580=x!P~v@p
zr1Md`saLw#yWDr*&c5&+bGTug7tKAkYm+rpdUeNdH#Rox7HXLaF8*1bKLa<6u<I~z
zBmNj^?fToQ%=5?yw?ptVt-{HVX%Aj5uD+f^ovp>Iw{UYe4`NGqzcm=^Rv%}n&k$jA
zZktbgtK^fo{Z5MPh<;65tl69O(Brbd(K-t$3WeO*T(RBxIh7z_>A6~RsCLCi;-yiz
zT(GHuW&`W%2&=IY%Z+a?90%lYWc{~^rLkmW=rnTsx5i4%YHMp}su0Kcr;p_i0SzN}
z!hidnV)RsvTT>_%Sv-NwalLolOyyRYX_uW@_hbT3+Q^5yO}8HG>G5J}JzCwi<F3cf
zyuBA1>?|*DTJd54jef^W^io<nPd=mC!%cfAkA&yJPB&>rdb)?Z=dmp&i|t;4`u)tu
z)F(f);^onIJl!jrgS}2<G1Fz{JH2UMMQi7A=R6V+IJDxp`Fb^ir$_9~pDSBb<{{L<
zLTg51gS+Q6kmY$!s&fAq)t5>Vd|aIuoeVf6HV<ahg4MBl^{vOZ-`$ft*x?@1-Vu#%
zqQZD?e3PoQTj0{IdToK9rV=nnd8A$E=~28pmA^Tdt7_JhqFZT~DM`OO@+Kxa+HrN5
zGxPD&a*HQ6HtRi(9Fj<h(`#aO@%}mm{v7_1xvd~UKP+P-qwI&;Tl4K51KAp`i1%jP
zKitg4Ws$c$S{?1WlN={1Qn9hdOw7zq^y~$Ht0U>e-Je@pin?wuzkdC?HJomwKzpRG
zk8sSf{oQ(Pzv=h>OznynS8;F-Mg|=8tDUUtU!5f-C2d=YjErPeN)oc0Z|kx9|9Kw!
zV8NFdbaH#7Mp~WcL4p2KH)n{;uggId^jtrq!<Ab-4oDr=MknLyjcu9_rfb|hJw2DV
zeyxrq@|(YliSZMge9OqgvvGWMz;Z8&od5aTh8E|wQOt55lUffw)_Pxte`KWUix)qC
z|Ck^qCSD!P9Y~Y8Tz32B&97ZuKbHG4e*gY0<A>`)6`IagmS!=OXFZLi@R<B;JyGv{
zJ|iRJ&Ye3FN1Cq?UZ)rLY|#Ag9ld7bFpCxnVv=tBY&Dk99DH6MDP&=2h)Fr{c#jBn
z`;|aIm+Cava@*rV#<{(p<m>qwg|{t-IapXOV)5+@nS6~pe;Kbco^NTg(jE&7tD&L6
za^#a-7!46oiOozc7V&j4_g#WatEsB;2iI4}%U3sMo5HDuiO9%C>by=;?zl0wrieJM
zz&kAUq}q(?ArOe6yeD)Lo>p-S)$iZGf7bpk@wWBKbj?9F9S*@d{^<m1t<2vn|0lQq
z+I2@iWM;`s5w_2F*-Bu!C)Alz^S$01$I$Q~^YQmOA{!}L*=IGbzB_C2*{L+T){F=1
zK|9gx8VQ0nDM?9d-@d(#j5M~iq#`FDt#-BvO4rfR8UFY*GBvdhfiWt3c5-|~CHP`{
zXGh{CS~i^a&NXZ~2M34Dd(leCBAM#>qE2hO=w=)ZjS*5#t!vzR?yX@oem``p9CG39
zAv4%JImy#Kx%p(O%CT>xHI!=Sd+RyW!TG<L`+udTRKy%^6E3yTQd<N&BcXm~KebR%
zkx7T&iBpK6u<LeXeLbJBu;4avUVMpyJvuoh<>&J9u2Xb&_R@TNWaeY@RB?|W686RU
zd54o@4<Gs%PVL*YdoL<pz@|)9JJa1VLZ!+0%ZE~ukdT;|n+K*dv`$w!<{P>_PYVz4
z8OT=dh~*Cdu8r*TdpqI$y?<dK{9?e)<<n4^VK`RESU1b*CDySg`@X0C-xBkuBF~kE
zVHVBH6LNAi+S|WFIM_Km4{N7J>>#`pHMRx^IfBxYlY<kVv^qB*Y%X*tMyo!4{1(4s
z!48(}<y_0dRB;+C`=k9|__SgH$C}knKbnKd62Ct!d)^n%XEy%Xx*GCB*2BagS;l9J
zY-4Q^Oz_9r$|`~1{BgQ$pl-Dj#pmq4=U=1Q$p{Eu)v}QWS6GelT8$RP#gRKZ?<h-<
z(ofU<nYQ%b{z6R`lPeW&>v->;;E?_P@YpHR!p?3B*vUlsWL?%XG9prif!)19hp1B+
zR<E!HxwzCU<JW4efo4h|;(}kqzE^~jYRg9|24Td8=Ex`2!IcT=v^U~o2jBMTzlev$
zV@nYbZ}oTNA8BLou(2)r(&g;zcuP#SG4SBNm#$h2=JXvM>_lkTOnkQ9+gz|5{wS5_
zKHGTl!!iUJHSaUUkx#mXI+Z>|Icj+qaY)5H_RAE)Y5ly>%d4!)DM**C#fcF=h^2gZ
z@!{cN#XkKS*K&x{YqO=)ILy<;tiLtOo@afpTou3YJ%-cU_i~T%SgR{0OM)Saho8Uf
zUNkLafIojEXkmhQOJF;ay8KmrIj(lIIvdH`84-mu8+`yxw6!N3msKoRhbdfKHoa=(
z0%gnGcE+T97i&B{##>r^(vNE>urC~%^%z+WuN|l(x3;#J7#Z`P<p2I{ZTRTXqvwGI
z1=a;x<-LvD+YTQV;JCGHAN406w@$mf5{TIL62ro(VShr%q1lZcqan+|W+Yr~bhbEy
zb+kIA#^}xj!%e5+b2rrqNJ#FGpc)zORlK?qv`zZ$q-L$=pr?<;<;L{-*2adFlbM;R
znG@eudw2J6{>aCtrE6<z*~+PnEs)1UJ?cCTe*gZVt*WXjo_YegmW*33E-tRV?4+(2
zjXrn&Jf!umt}e*Hj0IKSwidFK>5!HqK7)fH_Y~4Ud?>P+*>l-)VrJRHy1=`)7hN~U
z$;x`rN>}LmYyS-@k^YJg{S|a;#&Ce3)ZrQTSj0%ah6LA7V@I>f3!0cuL-ju(&Y8m%
zR^B3rFGvwwkUB)bSF%GGU9@g?SWIqiU*j9%{(;haxb+KmyY3XU;TkF`Y}XcJ5u6<z
z9n)365KBrss@cQrH*Q3|ydTp#$9S77mObd9rWmic<n`;<Ukrb=mheM#wBOkNKG4u^
zt|l+<l*#l+<!z0__mup6fB#m}EXJEO(H}YB)tBXhog)7~&r#EZn3&N1F;$%rWrnP@
zo!#D6#$fIyBv?4E46-jAO$4Q+&^GyVbFd*A7xH3`Lj+5hLxK<9#>Z!7W`4?2SH4!F
zXjQU4T_eDMxYtTxL#tf$EO4qN)RM)$<p8odG3%!r9JmpO6zdWv?!$RQY(zG&U_SH#
zbolU9j>rd*{7Piv#25+tZ!;5<g$0te={sKiA7)ux9OX;}mKK+mCdQ6jG3j+FdQ)G%
zpk}=E#mxJQj0QfCB@ZEWh=_>9-FFR)j2h=KdDUyd(xnL+1pj=(m39Wq^?*E?JT#}M
z<I30WWbJUuA+w>pqv`1#AK$6ysHmVyeL0kNk;g$qfwuDGm*Uo5m64_<o0Y+<__Wl#
zg6rEoWWFby(^67WT3TB7W4XGzkKNvW&dOTc|Fv3PULK69_BuIM%~smm+iU;&Rs9p{
z@aTwJ=Sz|G#BE_=;qSv21D!9zGn__<&s*xhuX}S^@~(q|){Bu(GtNI9PPTgNqls+d
zp+=P3*i<}^I4UzI7D}Q<_2v#};@}MYnDAP#MX{=#Xd};`KTjoKG5_-?Ej9Jp;^Jqc
zW^5Xo<Xgrql68b^ckbvueHuSDTQTD)>b|>CYSy#s?y0U$Iv-#2EBW2K^sKDwDdCJ~
z&+ra^w$@HlAtxkEd;gw(s)`=-iJp1-Ph1kblR-Zuc;a8yOm(H!{qE}mVUckTO&+-q
zU2<$1$i0^@F~3%RZS6>m7EE}_R(yG)5E7S?ob0ynHNs0=PtPG|FGC?55LHA{(#M>f
zfbfWj2&g5k3hwzFddSfhm~;9$r4UB#o$9r}ZNBlX0m`H0G;$A)qoDQpg@)n&exDdO
z7nfu)m;S-Q!JZ!ce$Q|9XK`?FuGetu*M?9F^Fpxm^Q-CU=~X#Op>?Gckyz=M-lgiR
zv>mKwkJM!?=eYE2!ucX-{=4ntXwJl_C<D{mxeuv~@xy5{AAC|jrdjVvB~Ik0uv;5V
zNRv4TL}?F?yWX3sTFF-WactHv*L(oT3i9d8moHV8lai?I?d_SEGH}SabZXtl8X9IX
zbFH>(bFJYA`}^G~cSHpQegWEn;9s4r+zO)+l@F&a$j$wZMnm4<xBXTR$@yWj$WUiz
zCkl1pQ}^{iUDK|Fi#yjbpSb=!ehK<}c0d&M_P6ojXnl)x)u}NC2$rwe!vW4lF}*u7
z2}oRS_PY0V-X(s>_f__?1T&#(&50+oO|1+vi-OPJug@08cRV2ww#Rf->R=X0ug!;3
zS27C<j!aCDNbetee2D1#Hjq6sK8ibQi<)_+9)n=Le%->-()eLd%AMl&-d<b4Sm(U!
z+uL)cd@tR+dGq-Acz1baWd$0*h={X008j<fvGMTyG8GgQn(7aJt!8IuJMV57-frhJ
z>jtprkE*6TU2aN6h478~9=J(l-TM97fqlkGHy$LrN8;u3$n|Cuhq`NNT&*WDoVb+c
zR(YA4Q^Je^>+-JXu7ls4nX1{C+1d5$4|d}_7KZY(YY@dXB_)q<?R;vRw>XUDma(>q
z87?dw`110|A77uG;(KMajoPC}$JuvdSf=mZy}P?UH)m{O(o7h*GhS|4sq?e(4bG3X
zH8C+UBog`H!2^JCrk!!O?B*4Xjg5D2``p?o)35U^NK5k`#l^+NqY*(}zl+xWahjB`
zqR>Y310O#6NQXIku&jGt%@-AgfRf!CWW*?~|0Kywr;m(b!7VYj$YWWEkGYbL{Wcwu
zot0Ht<?4BSD8S9lEhyO2ra04X9+jRhoe31h(cV)2(ZL;YkNy6%CgrWgP?9SIPo^cB
zu94X2>(9VHGcz+oE5bx&YHF(aO3}^LwL4Wp0y+S|F-6PYzI`KPl6OBo+=DlR19in|
zzAfTWhWy2g7r(Er&eVCqS^7>$NSKh208l{n$&+S1t9RK-$v!b&yBpruHS+&PnKYd1
z3y)TF<j~1|M@AnK&PlM`-?jQsj1?&O2q$7i#h%umiTm9hKli?(jeEEPW`wpv?vKtK
z_HeZ`1PARvw9`;iA08gA3>O@X^ffn2y@Q2hm<4oC!~eXOf)L87`_50@nOYATIfKwr
zKD?j-z-f2feoNNHa_i%<YG`OkIIYD62M6cpTLRpPXJ(iMC|~8cN*d@4NBJRvz@L;I
zpmjM=P)dzvhJ8Acb!Vc|UOB!rAvU<N+;Zs>Zk(>>?z^T(?5aqLFDzrCzE(zPOKWR(
zbtDUegoM7lynLe~^I3F_?X0HRQZ!M}PMQpBmXfZ%COOCc4Ot>gjN7r;C<+w=^*X@r
zPfE^l{7)G!$j^_Bi0CiNeWaIC)PG}h56P|n82XICs`6XmtvB9E;<|{vfSzl0<kn-t
zbqoKVSjf7|Kc|8ps}8W!1ipLMI9;=}*g2Mx<eu2GM5d@xYiCyyPFu4!W-kISM6mGB
z`qQ6Qqq%C7wt_-zN7s+u+3d+l<YV-lT_trNiO$LsT&ly}TeAxtZPPU;=P_J6t|#pB
zO6@N<5y+5`wafS^C*<gYPoGlWy<>C{d+h(uo5YZQ|Ni~!*RMm#I6m~pB!AzjMy8AP
z;mW~;jhj$X|KZ!f?2S4vmUP6z*Glj6OV>zi39broA1n$hZ^!E2vnolI@c+KNy>|<x
zt)$d2Uey#rahrnTX>^#Zo?iFF0bq!wH#mtdVib3OB5*GF6cxoa2RomFh(}z^(<YHg
zNn&2dhdrqhTL>ij6k{bPC&MPME-#allYg;aQo4V?<7W3P)B>BSsy8Q3pFR~45lI$x
zriJFVw${@zcCwB#P34PmE8bwoJC^%-$5HghSO2^Q!%+Lsvs(8(5$o}+%=@fmHZxVD
zxbtwel+F+CRlUzvg0b3P44ZWNv>Sp|9PG-qu7<N}mY`nDwUiTy&v|Sw7Y7n+kdgW+
zidT-OdWl;OSzP;PH*{}9i)1nUk>Wyi*%_!O(4?6|9yBsSL!)f){T;J^?RGF3_v6Qp
zA$_PldD4Lw!o62%zqG%<&r1hg6CWR6F+CHQ1~2gG-@r()NJ!8<@q1?j=RtIyft%ZA
zw8*d%=jnCj6rCxTC|)S-&5nI#=Bm27Q!Cvab!7j+0oY+-`CcbS$insY8aHiQA>qsq
zPhVoxzu3-xZHoxLbEo3)&|c76Qe~}n$2r*9&TbPL<IvF1`^Y7gn%<9#B(EU^ETDRu
z_-17TS+ZN_xT-EABXcEscNi)cJ9|E$6n!M(mYCRNLxa@AhY#P)mz0+F^?U4YG_<x}
zz7<PIv^h}iyuPzO1G^PMC5RVl-sE=`+NwgRTWh1YE-Vk_lQHIchc4pF-@Au>kG3Z2
zGCplwVj>^=8|m4&TgF^EUu?IR`=BUlIMZfg;aTr)%)*h6IW)U2O>|n|R^I+xMd8tG
zCW(7QhWCiyzi%Jdto`%&NWtTv+s-NF*L-{G{g{xG@7Y<+Et&sCT6M?!>{XPV^Yc)M
zPB{d@bj1$`JL?iIoAW;=DojjF*forRHd_v6zJ6UxQ<FohET<n6rQ*&!P2&4!O_FOE
zQ{20y@jn%MZ~i*VJT2#KS79-<vN}@O*VlLH(xoSF({pppZEZIqS(Qbc)-<)^Iy)bf
znLthSP0BDI7#JX)3Hps^5fu~j*=^@1w4dtEgvZ|`y?`k=27K`L?b{g6lNWuS2AE5c
z*B^@5%?BhVQjz9~a-Zl~L?E)AcY+>2dV9Bn=+YR!01M{21;b+!8?Fb+x>JXZZwiQJ
zFr@SI&&tgChQH>`O;#FLJKBG7bFM3XIoFR8<>4uU1kbYqN%b3*J9LIsRaM5VVMBw1
z^N0572{WDtKY%z{ezvi92%GFJS$`aqnvkINOiWK*JqC#s<Ey4>^2etG-Z-R1Sx(Lb
zU=|bv>6`s|I7I8%r<q8AK(ua@Oy9oOcQr5nOP%)7O1Dy|FX1|%ETrdAmH9vxiy!n+
zwVp?=k01B<NSt&EZFD6HGOR$qYSa6^0camuarZ09rgHm0Q@t(o@j1uu1mz2IFm*(p
zR9sVIW3OUb7pxVn1K#SGP_q=fshOGoXZ#4JK=f0siWgH*+W<Wi6A__Cn4(3Ue}2<^
zW@19<1^<Jx32dT`*a8M+`{xvu|Ju@$|5Su4w8CRj%|fKiLE>KUGPZCc0G~mior1=#
z><=F%Ydu88#c4wkfHALLy&BIWv;xgC13rNov#wJ+-_BbomEk7T8I_flett0A_h@f-
z_q`$~2Zxc7(YbTyYCR6J3JYz_&D-%rBxxI)noOQO6LDDX{iIuszI)ngLx7u`o41+8
za_NS^0qn%;$HvB%E`kb%emgcmFDMwB%WGz~ghHWM?#H~p@4HFO|D52fArxx@n{8Vw
zYwL)sQQELRHLkxt78LZ!<?d1hq~Jg9>F9uZ$ca4*B^J_9+a=yk!q7?~hh+jHfWalZ
zG#A$v*4Ix`PY(UDuGb##Z7s#e#|ya-V0BwN5vPmTFX9rxJ|{sO@E&e1+*jid!ZW);
zyx0;-#a0ISHe<3SgaVM!8zVX~1P>k(tstxTMvHlxVv1WXrvp=Gg5xVsnP_oC-8t+E
zx;o+~jZGSlhU!57fPLaKbXP}vdkT-?Lt9%-$@*DA8>7~cr;AHFZjskR?-AQL<p^t`
zY45nckA9+dxat}b6qInw_>#yiP5Jw`jZgunG?TC$vyc#Z@+$@zKgq^EKFRY%<^$Ku
z1R?oD4nZO?jQfnw6<<tMT{M@l{dQ&_`+=*<%Wc-)=Y|w_7>J1@uTtGsJcTdpS0HH*
zO1>+7BmL;IQdsMg$T<yXCktv4LK{W0greSFl{mdqEPDx!Z7GLyG5%?dtH3aaW|6__
zZ~+fn*$d@Vag&42A{2ym!=%AFFad8kNCK%jwF>oWP$Ts7y=gLi=af>#r<$8HQ5?-J
zEsVOOoBf&h$7Zq$_3MCHBd4M1a+h5VjBx;%-zHXii8I&N_q-b2W(bXh$3E(#AP>(M
zhwnsasl}HeJZ7dl`d&wlP)tIqm{NOruJu6-Eq2DAzhnGc?)+80*QQLSHa0ekHyn68
z4-tZbf|@S__jWcm;skA`Arp@ZK#(X}HonH{Y=tAo<PYiC_xNbDgZqL1S^gK3<Sf3=
zA+Wn*N+sz>OH6R=np-Xfr-(Xpi4pU5e)M+BD4ktdx%sKE=qm^dO?+btxYw>-%X$#+
zllMBo$g<A!s5g?ZM>w{_W8>RdhgiaOIMEqgkkDF{h+jRHehm%(jh#uEL!mPJrS4l>
z1!y)EZnnFFjlj+KwHtQdv9`7jTXx2FbaZsm-%hrD+|cr9x08}w)Go}zC@-8T0mq>K
z!`&Qi>TAT&DJgoMp5iN%V=Y)oiHRL*+v#$_w!FrbupMhj=QxC@qns-JWfWtzn5F}t
zN~hwR+wmgT4t<U<eNEW(S?MN}au*bMj5L|93+EFkKiXWN8ZSsn(&6O%C2=e?o)ht?
z5*_L{IX*7J#Rd2RXX@G&z62L#W#upYl;qEat8gIe$?(1M+k%Y$ytDTm92@JITAI*L
z&<2ap-Me=Wm}=<TSSgvUEM{u&=+r2ZLe?5DGxz;W*M9fEVMkDo=al23rluCS6LYqV
zx&*tc+HHqx%a<EP{v)=wR@@;U)`>>c>2=yuP0jlIO-)VECqB2a!MDf3!*jB;^VyKZ
zDc+oI@_#=5{sBQZw7M166QnI~)JxjtG&W{|f`wYvz{5Tr17I1K;?++26g7{*nIvp^
ze>H>SvuDqCl7ENP(B$9up%MLFQ`7IARHv;@OiT<wiZ`VV4Ul3z;Q`uMoj@Bi0|Z{}
zjDN0YX=~P;M#Mo<CL}KHnPT(kC^?3H0=hL>Vq8{#X{M9RP;nf{UbxWhwcY$mOqky1
zKW{Iq`o)--l)T2D6B@jU@$vNf*%m-?O|@{8!vg{&S8Pq=nD_CQ8R&068$zL8-#v$o
z{X@a@EsL(ME<zMVbUA?bO?9<sBC&kRBOx484#|@XzkHZ&uN&<6UdB)7@0AD)43vuh
zwyYA^Nq=#1@i6C61~fY1R_`_VtjEhB0m~~W6lzy+iHoP?{<O=tn*8z-XwGLTZI=BX
zu$&Y;W~1tb1C14IE*sjK^5TW_U2SdqSq;7PP`*Y_f&k9N=8saBUQ8nc8=qF%nas6>
z0`BqNlv;$!Q4KuFBL9oN^oz&F6tFLNaKya5z1#RyZ)9a<DW`}vaMhh0i((U>Rat`;
zMkcBGvI)V7r9~)$7={1Jp;?rhTjObEe#MTB$G$_XB(n1PYiXH`zE1#!W4W1PxTc`}
zPs@7P0kk03bZ1Y+n#XwnVP(Am1U1AQ8PUxncf@IhY|ki#l{p1T8o$!JsO8F!#bfL6
zvKN<H++9;_C@LySuf{b^nB&;3fEk4HKfRCz&Dy-h+pA?#Yg3KVgk5q`*QYgKH8(88
z>4FS==GUA4*M5E_7@3qlgN8Gpk#vSg`caE}9V0{96&MT%Yyf4G8$h+WiJ*s2N7^j}
z<>#}z2kN$mD4JPE654Ros{uXQ{raw=qCrdi-1{)vJ71?IQiWbJigRgy4xx~02@w2x
zA*L;dQh|RGH<OHl0*ma8X{m8H6x~`N>)Rt)Gj4E;A5kG4FzDv3r5^u<67~pX*$(;c
zfQs~V#>4oUsi2$7gSo^C%$<r#{1vPE^v27=^;!{J3jA8Md)|eYE?+L~`H^wCzd&2?
zS@e-`Q%_H!^ZGQL5wBk>ByN0rked|GC3GF5-=?O9XOZ?FLxrQ}AjVy4A1yX!n9j)s
zKCYOlDh<=7z0{p-xMng!)_wVc?|mlbgJq$|f_!`jyBI)M_8^}%<Q5?^BwmTV*S|(Y
z)F{@~N_c1Ul}!CCJ!hCA@+G~{P=tNdO;K9frQZ-veSJp$xI9lzWRynrg8#H^GF3f@
zrG<rGtAdtu@~ukA<9C0H+~V?YGCOW<k%v-ax%2altE))!x!AvaM32&Pkdy0=@p2t5
z=h?<L&vAi===Cdyqd#>nU<c3}FwH*4-VqUf&{Oob0O+E$2p#QBC}H|2lumka3J+*K
zAn@+JG!?)LbbfQ%t)>s3#g__h!mHvsmyw(qxWip{cuv>sd{bnifTiE1D-UVehXA0!
zmdck3AT|uFPSwECVy0_hzpBuvmX`E(U;C1#4;0^w-^y>d#NSYgOTF~XoaYm;;J|;>
z(?aswbr<JzplAsuCGN%-mR&R)3_;*i^0y5PMABRcQ-nT94~cm6tXFgZyVN(HVH(nc
zf)?<i=ui%)1`KbwNGW??6h75$Tz^~vlV5SoQ&TkFoSd9}dZjk9!H_>9oCq5TyAIFo
zQG>d%+4&PbIl$+kq9Qbic*fswmzlp*Rfz}-{}?OvKhl(4JDW#jgnvuk`5fJBe9<rz
zGbT=q0pzgBYG>|hT7l;l7C~!);OkNP<OEdW4b`lo1749Sq>Fu&*_bXNLclJdN-|Z!
z{f_FI)e6ZqoB6A5X|xe^w>zQ6+P*O6c4*oD_3I_&+~I?}ccqr34s(M-sRX(7Ym;hP
z!y+%UV%@y9bh~?Fy)y2K`6*(4;t!-7(_<5NeY5BnQ%UPC$>iq`?|C=1bKH3L2=n{b
zuO9>1_>}jpt$zRjcev)@=-87a?5j@7X=z|Uzs1}6m&SQd<p43!)~0v?q(buXHjuNR
zb)Ty9qAR#TDFN*n|H-lp!p)8EyPSdoJ_YZEE-bC*+hV<MGeRTOb^nzZXx4L8vn|Zc
z7nV+@(r;-6#{|dwf0V_L*6nl{z%$Iv`I=rC57edimwWM4Ll$Y+2Usd9HrC(amhuv>
zPK_(cOHwS*N086^7*WF+3i!)+##I6sOR+3KO2jMl4N{PkqaY{eL5K1{K(*YvPEJkq
z3E;j$`YoQTJ0I>wPEJk=osg_M{TNT99`!hTNI}WrxkUHvOd4Ei%J$;riEL!3v}EpW
zi@_@o-65k+RM>F8h)y~Q+Cc>pH~W8<LiXL--TqGe_*yElCjfSyCs+7LHwgXDpOwGJ
za31}Vx8h4^NXT37wKXeh4%ZvYHruNs%3|d9Bb1K*8^Cm!1Pi*}0G)fSHh@6*^^7!f
zVSc`d?B1oD9f#^Zr8rA|85vBsZ~4xSRc~{&-R}OhPGa-2r~~>(7cttKQ{CNoX3fA<
z+tTu)Lz|?Q(!fMxg&<l_05S}?r+(7)pp6=|?j^ZqsxFdoa{iT0n+dQBNM;aGQQ=^L
z8GtiE-r?N0*=4%-P2SgS0bxIvQce~rH|Zed*ML$6v5H@H5pq+_%ehNh5%5V>U%pU-
zbth3M-48n%q(92{nckk<dPq(8kIAq9wXJ;9w$c5x&p00<`<SJ)P%@qezpviBsPRBa
zsYRi!w73{kV$w0%&`(0R4tbJ-l2SU+*VlKfxQjaCZ4)}q*}gGM?9Ls+R}>6uKYrYK
z-b>FP3s@Un$f%HbWnHc#I7?MS10bJalV4hciH&S2`fAT>xzYo@I?oyqG30(CWU?RO
zp#_U6FkG8LJCJ|A^=AeODmZ9(0WM;@aD`{7@I{CPbU^B#blQ~)y8?DzV-cfR?Y$99
z`5}nl8I0*qT0V3rk$gV`149nSGflBa{0VHzQB8@0HY&WDe%__Z(WUkA6XllKg@q?8
zL+lS9c9k?&x2>Xr0t7#Lg2e(I!SoC8{30t6K0ZD*jc6+Md37UI78KRxa+?|PYeMc1
z)h{TKm~Nv0QfOl2#n>)mu%;<6rk5)s5hDbD=!RWAOo~|LOn0&y^R;!$k?yPHJk>=T
zEdCkS(}4N*Gq==^#(=^pMVG!SS)O+RU1PU^8p;>E$2Og<TpdbNjV$7k=GT;D#DoQ5
zTTLQPdk=!wuhpKZZv_Plo>%(;jRAD2(oAig$6ZRc7dc-qHY|T7uo)Fi=&rDtY0Lnk
z`H@yPsL}v#dIIR|`QWTj2R&(N7^cRULU&3NM3@<5C6ia|SH5sB*jY&l@16XCrVf%K
zV|ZXdX3VZq$27q6CC1Mn<M1jju8*%TLru;@=!U?BbuU1$j6&G3g7;p?T6<d?2qOK!
zQwZ?Bm=wYcva?PDvjO3ES6O*|taJ%<xA-*Dwo+)bfg9eXSq}=RsHwSgGYVDh`bQl(
zr6DymxY^k=%?JEkTpHIE=ThHO?CuK)Utd(6_}rqyrTlI+xpmIeY^m0hhLZ&I$q8`q
z-eKeQhm?w%$iP7T>ADjuqkd#jk>ooYRoZZ1P=Ecl=J>l{ijR<dyu1b;fV6qIOdpz>
zE&^c4ewf%o*p>JM96teq3C%!eH5o$-ey?K)T5PAElYwWhs2jb?$zc<E=IMD1CWKQV
zoSgg$D@ywL^PDfsKq4@4=3JX3TD1&7K+{iuALyaZGge`3Ij1lW4h0yCE?l@^%S>S7
z{@Oh~E2t;8I|S#IyWg8PX^chgTZ>%~)#Ib10hdn`aDVela2v@<Nu5Q9_WR}sv%6kw
zt_~p%cWF#Yt`5uO$CysbcV7a3(h-;`G%-#N4zF3sT=eeTzD>%C^;`PWO|EI7-0Mn7
z2s8={L$Mo>cgo1-g{>3W(9r1Ov1ZHo#|R6vNHm6KUG1X!^o{qE5s1$iV`)Xj51A$k
z6cl<=Qn?J3I#~ttL7p|73`%P$5fKyw8=D}ZnwXGWy}CDCkhrk8_(X)r`xPA()y}@-
zgH-Y4N1xxhCpXTjst%m&&XN3gwSl0=qZRuC8Qsjx3^T=`)XU2DGMJQeW1$D>hy8T~
z5_!MfMQh)=hMz$*geC@hbDasoA=elwiHYYH76hKP5!}|4mA#OF#lph!;Zt_Dh75}8
zV%YCs7cY`(BHobAzQMs~)6Oh`h$d4|>rmb0EhPpGK0wBm_Jqplq!p^DU#UazTqDE|
zD$rpQxwp15ln<<;nMEqoRg8Qt7-Y&ph?mp*aooXOw>DE(x4Vi`;aGE6u(}R6a49R(
zLDzE=rodu={f=d0fg-kEHFwo^4N`~vQ)q<4Wz>j4rF`V*zOJ%$H*){@Hx*@E)MIoc
zp^M#FwWr$}j6oWODF=H?!f)Pi%gS`!udeQRVAYLcVxo-DQ{hk=gIIHL*gqskcg<m`
ze~c?K?UHYo-(;_>t{xjd*3u;Bx+!^AezQYP_5vjpJ}`Dy(sk6u0`ZmGo17i0MK<q0
zEC(e>0PjYSAT#k}kXlyj>m`S8gRkJyQhR4-NLbiyUS9O!4#@!YOup<f5w}`sVQ3l5
zfTUbHZCW~7TH_$##P3y|KX)!pxL)`qngwy=LI{bmyues)DTFhhpc@c=A-JXN@jIW&
z!MDQ9{F*5q;qULylmeLI!fIc}W#UUT@4zMn$Ds61<hO4I9JZA@<N{PwPTtjD+uFVn
z#8tR2EG>mo3H}5l09vN7*bb=KcLC0tnwroK&vphp19rCa9`o$-a==A(U&1qHY7ZXZ
zS|w44H0-Uvy21ekpGj_(i58PPu5v{)P!;2rin)|SmZT(?6~&qt2c5dP%N}>#vyRO#
zm3wb~l+8z>vh<h(#(mUSH4ZFs;lHf3kO*Y1xn;W^>!oKl)7vv(Uhwpk0(CWCL)mhy
z|I6e^;E=_0yYjz0<Xb%2S*g_0qTUI~v7IIPq2y#r|7*=uk4GwO^u4@J7P}IgA_j+s
z7=+(am@?iXBO@~{8-+ZQ#!pE}>Ez@D{j2T$mpDe=_}p4$0T_1ZrNI&xK&>tgZdzc!
zfm`ae!;n2?a1`2rG8{Ju94cVEkO!3H<OmlRDPLg$d89tbx<F`t`0xRw&T#Tu(t3L0
zyuAHmW5R-hWE7ycUsK@4Tmn(p-p)>1yfuVkAoD&Me=JZ~vV_)SB_@FH*TOnqS%*C@
zJI7D{=1g6)LevkCF-8qW$<C@Qe*XM9Cm7HgJaHJc@M9v(VnE8Fez^rnH#X{hVPW_t
zJ{Feu?~LJ5(P;zt5NMp=Mnp6`&CbgUvXF}a8(#m&NGy#-caj?0-Dgyf?sy$jTY@V%
zZst|(PMF|Ul&Hcs6QL8R8CAQ?Ua~_fmZ*c_1O9ybS?Y`8toL{Qb+=BI+5X}8@(}Do
zBA<?rcD8D3>vn!xH_XmQy)y;>1D<YVcQUI|;_c4Pk3Z<VkUV^re9r=&6i>{}a89?k
zw=W;PXZeOf4C(%sWB6+qcyzw_3;7=Zjr{j-jdk_gtM!{sIb<d-t}vH5vLye?rl-{W
z%C>P(5=$(;ipg=jaJ906vn>7+je>uR6DXp0b2|W3T2ug%6P_5hY1?*wa9#I3YAF1g
zbSSJ^)JMoTYke85=;q+&uE?|U*nvK}*&hz?9A3>szL*mEZ3wP04aIACX9G*Lu1viN
zZ$_c6wPS5@y$HYnO^WGjZEYPNukt+F2Xz#E&m3spF5e`XzOm>KFUDU`Sn4kY(~(WO
zfzwklLU#b{N`Tu+C0KaYC;cp3U0q8I(Q|X!WzW4ci0OV^wGCm{`1HC~+IKAaG9L9P
zI4BSZAu5Tm<jPbIZ~?f{CkeSPGHE@DD%hoJ6iTNPc>iUMy(FIMJI_3%J(76u5SK!#
zYe&#Cp$mQ338TbNRJVAdCyET43bZS3-nfxQib@^sZfmPoSj>D;KR9^J-)Z4%H1wju
zCp>2tT8f{Uni@S-R_<wNID6S=|DLjPYp&(<=K%wn3JU&A2~)brt@@Ax1202EjmE=x
z;icfiFOq6k|0BGU=VsL_w-bp|nOmi2(z_&Gc+{-;1nF`a578M{Jl1NKnweR70o~Qr
z(Lp6_7nnrtg$kH{`7-cf0y!>uuP%Q#GC=<LKNgthqJ2H}WLIK4hI)G?RU6!Y1(*Ic
zz@#awwrppcylU%{libSq_;C_m3>eqDkG|dg4}+_S?6@u=A#w9&-uw4w##Ob>zvfw=
zsq=qbuScG^zl$*j&GgeJuC1_6MrIZkDdZdJG)ej#)$FjUDpw3oz}YGmCMIJ81C`Hd
z434r_CS>pXZ&+AZJlE{F5?<eF+^-^Zh1|(dkf9l$PMrRdhlhug5ntG{oJo;r)T?y#
z$;?NeqO@ma<>fEqcH^R>NdU_Sn_lWHt>=jlj0(xiyID;XDt<_f1#7M>#_&>Q5xkPn
zP-{w739(`9cK3g=2VH4qKMoVTc4HGpAu0I@@I$Hsf`etgG1lP7;!Ql84)sBx!j-7!
zC2rj!>_acxA~P?_^f5?_r-t8@QCy^!R6^lXv)*-Z;8*$pykE4_H#y0EA<eL5!BZP1
z>@Bc_WbxNSZ<q}dhEZ~bioFk-o0}__D>4L+z-wZBfv}s#S(4!)A+i`ozE1Y#4Z>_<
z@u1X|0PIZvbmH(Jrv3Qyt-<{=3qd`r@@iNA6ie7<%n*0Rrx%{Hjq2?`5gp9T?ze9h
zzZv+wHdg!6*w^>+omlQvUgMVK{-21;moB9<oGNTU<`_12#BeqVRwpOZ@JU^vbo`x9
zeYF!FF!hd`-{OU<x5pKSllmSJ*bD((L1XQQh$N+@eE}d39JGmv)j+Y26zT(^t)uln
zKAqDxx*VF!1mY8|e}uvREjOI(-ufivo2si5BP0~!f2@mKzGc|B($;q0apn0jpL<{0
zB3N!$`Z5&2M!6+p=ismn97rRwjQBSAzc%1#HFZKe1e%f`9?i<);&8eg4jmocq5u_$
zwves2H#fnKDkvb(y#MjzN8px9nY!Mi!3Q!lFyIEVyLw|&@F6od7D$NykyS4|I;DTe
z|2%C@1e)`FrS-J?{N7e#m2-t`)+hZsL(2n8gb+$Q!wNk32cOH!9d&e)Vq!>{DPByz
zmpWg@Wor(unG4`#9WOR+1p~!mcQOvc&i1zZ$#D%((xs)P1qB5_N%K?ni-TDRL~*5f
z-c0Bnfm41)=VD}#qS~nbTZ$uB@;`Q`Kg)Q<+1c5_?Zb2=g9Xjsjn+VJZQ4agSG&5M
z3}-7lw_n#FwwhXTS`6v=Y;7MMz0ex2FYrvUCzQ&J#4SVfDLOeS>h+1{K&hFs6)!)(
z4w&v?Q6eLWx2PlP>gpuvg+P%>5)&15J9R^SwszZ`ZwEcR_Znnb9@X&W@853~qMuc|
zY+WBbf{qC2Q#VjkK0G1&->V03qX||T(&(uQveJ$7DEY6hKV^n|(})!JFnuw-2HJus
z;Dwf^eNJtqq&urj>>U+FJWnh8rJg>`Eo&cw?h5viot+)1I)M3L7Y2@92z+Mu8Nn00
zibg~HhCplzn)jm`8qz>J1vU^^v0<S81^D?161~08ik{EjokQDh-nJeW5*5AwUw(g3
z5;oU|{y(eu1Sy=pH}tLlX{NSdcH=rmUM+}P>C<zmu%>plQzuLXXoOH|!Ey+RWNUM?
z=&`>333#9Yxg~*$35px60>mC1uTrq>fH;jdmX(!-mN|?@^fvg9K8_3woH_;ppB3w=
zs+NMboR^n3Jw5%vm6nzk;v3|7-=T@oQE)9w9sZ&vlK%I;gM;U<gT$@b?GTZF$IU5*
zYnnuA7A+T^CPQ|O^imMVox0BY&Jc8_2Wy+>?nX<!!5Q{x%{X3@uW&x%1q+d0C!X+d
zyQz_prNEto3p_>KV;rbDMjA+na;d`hS&@+g0I^Or+66nPT{X~Ef%hI_a{9|lpta{<
zOHY+}hh-IT@Cc>H8g?cM;{ViDQ854~d404S=hHw*Bm{7UVUreC^TB9w2J!WeTpWk0
zMqt4?+8<Mv2xa~Qum8RBr>5D0{Hc%e)7#&h_;^d$lhc!u=;^<`Nu0=>@zj2w!xo#W
z@4`2~Wg_nRK&NIrn#AUATUS@X7rCXSrQo_!wVQ1n>~PBZr#%f!O&5YS^z?oY7qn(8
zP1(MD3dT@u?3*W!HwiYFm?W&m9Zt`8fZcaA35NL=m9CKG`31|PyMN*UoC2pkMl51=
zbtE`EJpU(0>?<`?_4p%I)%y>lpRqNZV4qq`zh<sSFK6F<y4>(9uh3!oYf)9H4EPSC
zlnV%L>hQnU^!VEwa6D~n{oEz`{U7kP$AwW>9iZmRYCS&cCLzb+{AT-2Zj2?AC|1IL
z5gm7{LsfM>Hzy}Xd@zaCwM;a-v!Nf1HXEdzeqj3HWMUHas_ly5WEW0yz)F8)>v%`N
zW8d%aP?A=;-wb@AzM@WLtbbM&TlkHa1f^QrtX<_$G7Z>H)7?SHU^T@$63=Sy*eH2@
z@d#IKXdzSVXB2(jt96eSw>8a<ql1d5osf#jttIXkGoBzeGLc-n7F=1;I95ug5U!UH
zzb7e`Hu$uQk8eOJ1u1SMYXj6x|BpumgoH1w#Mb_LehnJPBM|quLv|W`S*fXSLpa_Q
zuhg>Luvb+T{l||RKo<^8#0AWG`X5ibyeA=!s+^sdM+!TBPT|PmOOuhTZ-U_$Z8LN;
zw%iiG#$(moYVvIpVaz-AzNcLsETEP$G08l97}`4b#CZ(l2!X=?+q!zd-Su<Wy5u7V
zc;mM8YDCJd4^(eib7>%5pq$UO7XJQSOGfIH_Hs@>vo2}P2IC24ROpj|IY8}MI^Eg>
zAIRbQNMY)`caI}zfVW51)PTZu@tpT-xyutvc9oBj14@4n<sTC>^Pi?4FE>b#Z1W2O
zW5cSfSM6Y4ow>lRQTVg0Xqm&6tU`#yC0tN~ZF#835XZw~N5FdggU;zpK%RERPv{hk
zjE#X`9EF|-Xn&xgS5{Za#tb3xf{O!8g)T2KErQC;srSX~7dyYaoa53%3K`U&F(3MW
zdy7kc{+u4jvPK*qs?D|d>AX1b$Diu#6j8f}yl7}h=y4#jzpJK*Y@V63HZ+WgiMh_r
z-9uQp4H&kF=#seGZz~8U@cz);ape~h5|WVc0wo*#&5%ez0RZxS9UiL6dHscr%?VIU
z8ijfSR-*(?S5s3ZylSD>LFDzGRt#hPix=O3)BoOoGwrD@_x1eW?0*Nd2#l@6vH=wl
z(NK-s0dyVN$})5k;^VoI<_joM{%z}%W4<~KB&eLAIMa}kWoKqSQBnB@-vo3qAWM+T
z1i&H*S^v|gPXy~gZk*0WuFj4I1qA{65V(DtR={Eq{7Li^kgvfaL5-xO6^HeM)W*b|
z_)J}09WtEOXDf3U0t3<vIH&*jda_P`vBSZm-UtcW4_Fu++y*D-@$Ouk*YQAYKYFS#
zHS&2*<;w-s0n{4M&V^w}5vDtYy=q?TctOiOHa7NmmZJ@}b#iJ-yWGMNCOn|^0Y5Ml
zjkx%DkSzEF1W1Tz!H=DvKT)VJK|@3HWq)QtRfPxS#>E&RX=x-(xBX9TJ@t}H+x>-5
zzxk#&8OMm3SzCv52Sn+=shIdDY{^JN7M+`4TIx@iiwx?q;NqHw-SOkQeBpw__x_8C
zlmZrIKrpzt>;dyFVBRkaF7e}!@854s*wl3;i;`s}CnZ@~TH;_yfUOTEDH<Ca!H<1f
zM;oAE#BKkashfd4fckN&Xq^R<oFX!x_MapEkL7;OYeVGLBe9DY0|RGP1|Q9B?aiEs
z6xX%&^qlMqJW*ABvMpWG&vtVfhTl{cc6Sj0Ft+Fi3^yotKrdaRrk;eA;N~uT`<8%=
zEEdY-ub)%ZGX8j{UxC^(Q+ERGTER#7_26kxKtZtv?1I#ikvWaXB0xvZ*K2Zea-hMa
zx6$De<mcap+6+^!|MEb+)#Z+y&Rj~rJfb7hJ)14^uWXsUgf+vwgpHV-^l6Nt+{E^w
z+|e@dVT%PFL3=`vi)-R5i49@MFOs?>lJ3-<vf`)jBDWOPKdWJmZE#JjF8bu;Y_x=4
zE=79);Rv{}rMVeop4p6w3PBW=&k;1VfDP6_6oCkNYrVS#rP=NGTA&F)MTiUlf8ZsM
z%!W2OtR9e!w;IeQ?!@YQdPl+r0+LEs9M^=J(Fumje*>9u{VQI-g>i6daHDLlTB-ey
zk9@~h`FjnQg5}q(x@S14xR(8&K1n++1<=<f0#kW4ROINu8FJ5E3k&&7RW+5jqV_G$
z!ygO0j$QN~KD;GSw`s~ou+h|L#KGxh*enXh$LrU#o|c-fd2EK$N%;BuTUuBEOF_hb
zI${Id9axWyRSgW@fBx(QO2=zy!{^W0N|_Xul}|QieT$24H5%^$ou`@m@uLp2qby8d
z41Tyv3}+7oo%l<t+aIG<kl{3<B7poG8zopsZ7O?v-^XxTpMnK8vf$td<;W6jY~kTw
z(Z~aVk3ZK=)5?OXC2B%~tHxjB!#gnvru*dldAYfy;A}qF<-!0;7Z>|nRjPCDBVxp2
zeu0FOL+fE24-X%owvXa75BHQLF_$0p-V0*174Id}X9O6TBtGhU#r;7D|G0z)DKWKV
zvH!Pk8cO=`;77|B)9$}kpti)bT>55L=`OA>?{|5oqk}l+&-jjR&A!2rYel|>b8ZgX
z;!><g95sgiHtMOC6;D%6j*#0yG>q=#=jMWkT+Ny5-m8Cq@IO}G9z;0Rh}%r<kIWv6
z?vukkT`MbOm{Kz6NWm?_WOZ|S%d@jE(#M##Q5^Dsq6dK^l(Z&j!wCGlqvPtzs^GOZ
z97Ks$R%_Z!8Q!OJLTzo`$)Xk$pY{KO34r>8W#&{k*eSM^?pUd(1<@s2={F!DsVJON
zep$BQ)Ki&<m)mW@RcF@o$ZTm&jCOsZV(==JX=jqTMt@@cP2PNAZc$PD`L@inXM0wM
z210{e>t0N8Z!L}m-VXGVxCWCt>P>T4q#Wd>CLQ<SQ^XM)AAFnk8yo4{Sv7F*JYQH?
zC_slOTL~|HL26LsZ+^=|;DD{#>czCvu5AZQl%<OQAf|TLl9S7gJ9x5uXL@zGvZ_)S
z=`kUvuRj4!<<Kr7iMpehm_=%(Fa%sV0qyJu@J>P<SvYi@(UfPqwz=mHD7LSo^SFVI
zk(!3);yI~>duwn6!5Jtvt>5&WunuhYWh*}afQv`x|Lqlza6VSu{Bhya<P2IjPW%0!
zx~{pFFVT5Rw*1jit>`^k8~M@&aQM`~z?j5|Sawd%?h*QEWODBMb*KBWO18E|bGCNH
zdG==7e5J#p&Y$*|c-HEg4<MC+oACDS+c$39fIxzT2D%+IF}`kOfv(_U^T(sXhSB^a
zgqf-7@6{1-p}BRof<qD<ApcWf|5q!6clXA3b<4?X^T!E@2k8L_9Yebpo~GiY$9k$N
zZIN?WW%ELN&8EDO8ei5OUua!!Sif8m3s2twh^*Qn9@jeMmNB)zs=LomJD4@W80#GE
z9~~XxJFWhvi$LUuk>9$*&c3rU#6%soyKQ{(W$x(v04p#%qN3Hnit7^c^74YZi8$IX
z2d5IOD-wAK?H@Qyg3gT2udYtys<H|Q2&Ad(w$gc_nwz1De1?n?z{<cN1>zMHbs%<O
z+Pb8q<bf;rSfF;Mfb#!d$`^hxzkmQ}*;*&>P}M$f5hK{(4)<Zdf&=q~@6aaafU^S*
z0GL|aqhTO*tMH?CsZI;i-@$??8F#yc<N04Vu{Via1;2f@6_#VXF%N@w4p+Z!s5Z%%
z9!3?f_jdO;YjcSo%S?PWKG^>5zBzvzh9b~tjejPGZqqm|_kJa0x)gxY?#=FD)}0b*
z4PW1f$)Qxt@132Bk`kZWz|DP|gIxt&GF@&t0yXKWwsv(@)lU%S03U!`(QSYGMhNB9
z4(uX_&X*0S3CpM%FoOVg0^a1c7HB81oK$qh!BE&DX;IZsszGc~I9%T0)~&ksiL1bZ
z<qs2E?(^!^868(mNZ~Uf6nr1Zx1d$?!=-2>)cpn#HRSOgHS@0{MNL$jDPTMLF^{cv
z!Os4Tw4vw1LQu!jOP3R#^+umYX+zAFN@|MukhWf4*!g`&6!_2|K1iSuM@rMHu$h|*
zC|yJ)YOB1pv8m%_n9VO-_k2`lqg7r~X=~Wgn|5^Uf>V=&Xo_SQ1$!<`%t16mD8o+Z
zLE!?uPy4f#3z#97mzTkm^byFH<{*;OQ4xOge&7jaM{TB&eIp|p&@s4f=cK0_fq5Xl
zdKqj?c^ZY&;K7aGwzIb{#v%0sWDLgue(?U@9#|C=GqwKEjLy*!U~{mi#evvCE1SKK
zKjl<?z#dyI4g`XPL~73xj8rn6;kpWc{RnP3ZWK(GOUlb%XJx%iXhin9fpX2~42YI;
z@C)uQ7#MeVcFJ~k>Ww-8%MWvQ23^49(@zJe_ZLgN=I7Z8LcEI$#V+|&=<admPp0I2
z9}`%SCniCHwuVtUHJXoL8(K;v2jBS<Vp6r1!Y{nYB<Jsy`deSTB+yq^7dM{lVuwBn
z`XU(H6#e9_XpK#e8ejBdzl1IDUjH6YW+Eo44Iwv-Pfi}mi8!uu<bzR!n*x;y^o)#l
z8{gj8*&X^_UA=h8FCnqwrM-40K)9aKuS7O&w^5?>^zQ*lj1Gd$&ZHxT!4E83&|a2-
zB0e)ix9a?3T)?awrp~~P8Cn3T21aOY!LAsTpO@EI&oB+cE-a+<=p=A(yGiIs|1se@
z4}Du%aa|iN2LGwy7)-N~g@MmvYg7*8CBXc%O#pRusvK@8Y$vJ#lR+>FMhj6q3SJa!
zsMu{Cb#SttjsyXYgHhY!IS_ZiUK@7hbe0)9cTm(AM?vxht}Fp=K~SWorY^dE;|7-K
zG}vS4ic4U;0bB*>TXT7!ZksvAk~UB{@xdKkPErin{<Zy&?ted|z6)sJP&ros?XhF*
zsvK@xdrm)&#ho|nsRCy+I7%@#-X!bu+Z^jAa|iLa3mw$Pme7&w%o`|H4a<xy-l-f^
zJ>|`tw~g;7>Vzi7$1kiNFbfFWS?As?FXO+2%M%+Jd9Z|2FGNDTUgy)?T37A9r{{i1
zMjUDCd)ax(cDMOO8X1WVq`?4ZcXu(AsIc%;Rn;wUyzrdW^iE5gg{%#8z%XCpw7vW&
zpaEXEcTVAqo15FRGxQ<`bJTsiLy!igIQjYcfs8Af`vbNikii5%5!otr`^6*z&I25r
zsC(t_PF;fQHw|Dsz}A?9?dHwRy~Dx5I<7t7N5KuKLr;JNLK1dfI?Waq9@yNu0Iti+
zr!W{KEZ<d2u&sdM{PYA~Cl?5B^=;buF)`BKjurbLS;P^11&~8eC0xv$!t3vVwgK6J
zB_7(YH<lD+3Bkb~z+efdn>7QmN=-n(W8M!Vhk`nl(4936g+7Ec4+F7q>VE*a=SW80
zuooAR$d0P;NJC1>D;K{j3_YFy&3gxA0NudWWK$&kDz*oP-X468G;(8qc1lVIsoX=w
zS8_k`o&w9zNZ`vfi;dP3PY!}RDY}PtpWY+vWwW|+m^402Hq(^kAP#nSdC)M$6&w!J
zB6r|+jwE4wQ2(NnlYgwP;xsrpI>Jx@7_Ckq(ZYDHfuW(Hxp^+sSzwOgb^t}8s$;O(
zdVxJ-yS}C6(zpvuv&D%qzV-y?5&kL_|8ww|osZiCe)hwYf<GMRfmaudj>O@7?e2a7
znP|iC%5R8h;S-OKl<29fhg5NVZmm@4(EHyZYrWk%_2SdC%7F?`XEhEJm15ME^c$y+
z+}E0cI9+37Pai(K+J_ctrrhP*7G-As0Ip&R3W_h%e;gBma36~dC?!1s&frzxngA+U
zx)N-w!bb3WwSuYDo#P2lM-+ej89-Mf8)BRO{JBO)=kt-h1!i3ibLgT>mB=D29RAGY
z$vqdmq&FT8rKrM-WB=(=px=afAVc!<k`i8Gu-k$6cf9xfs89$RVhQ{b-CbSH2|$n3
zSImw=pMr&h<Llj%EV=_m&1I0)CG;OX!a#T|Pl^f&K^}h{$ivG^w1S6u4tLI+lWAo6
z_>eSUr}2$(a<WFOu(^9BTV*9pDzAyI;jDcGV(C6X^kgtu!;kS$7R+%)@#&_qYoHHX
z%VExBhMNr42#|#0;3_^o%#JxyphE>T{iKa&VL@Q^O&JYON9iv6bcJw+2L^tJU^6me
zBz^+cZ~<V7sl*UPDr~^}8<{?3I_;X%iB1u<uv#p_^7$Y{_1t<L_uYqGISxVC*mh<;
z`$EFjZ>B*CTD;H0m-LZCf}g*o-%f;mx}4Hm@<soL<L>U#f&QEN$e^H2e>`2Cl4mv#
zLIhK}RaJ8RPblW+FF+;(qc2|IPF*Ol(5Gbn!-r6`ZPva*R|KBAUu$DqP~gG81tdyB
zVxsXZ+{psuiV3_&^^<&cFeRrisR#jU05(B47!|FlIfBi$Et%Eg(ni8v5a(vPdwR@L
zs0x4fGr|i6jAKL9iIQ;)bcS0q;xF@BQUbm|6$aguy1Y>bGk%~hR96G=5ZPiAkbyxZ
zMkXf15jdpA&G6&9fYDzJFlAaFFK2&N&4>>5@b&SDyW0Uh9>a71LBiEB)8@n}9I}h3
z14!DZ{Q$uQ2HjI&PMvojDIl<#5HFxz{*{4~vlMkkhf6~}Cj12EQo&0^#BJi@GPXK$
z{<f8n-NIL!d-paor!fDAuJ4Y=djJ1!$lj|&Au>afEkagyAuBtxjI7JvJ0i-?j%*_9
zitN3!bID%WWOKi+&iD7d&;5A(?(@faJWh3VU7yc;yk5`sl4iVgF{PEAoed*6^hm&N
zCIN=SgWo?0e_6<@_PML8#^2C_8ryms$UD%f@Vxq7Z@x}R^*uekDv#+)CiL<EjongN
zQqoV@tnvIqjh;X$e~gR-T&T>f_vSSO;CS2H+n%R;BXC{VJ+go6ygIxF>$JJ~7_cHT
zW+f9N(<U8wDR6~^U*~N|fC3HRaI#O|pacchI4Cd>TyG?NwXpvL?H6nx8TSLmaO>7B
z%`K1{1|nt?r-AK&zBc#&eF40pvk$Cw!CK-Jv;BtuZR50zOx@wp9v~(WU%!$BML!$(
zG8PH|KLgCL#ii|R%lQC*Ed8C!x#;%JVi6`L>^K7X_-sW!zCQ&~{omoaxQNuxG&?2o
z6+~@PZ7|40C?=GBD&BlQ*NX`9$EdJ=5DIU%1-|{SA@eX;VRayn)6#+j1r1PlzGwKe
z&IwQ=>`m(H*Fk*vFcI$iJvhN4%opsXk@VAD7CgZ7!9(CT@4HajtyCC@N9oE%jQxR2
zA8jY#j19}Hgmnp}J6Pp{s}vLYRE|K53Cbc(At%FT>c=*Qh8MR2;~BOC38+KsaFYJU
zIsDah%&R;#<Gq@%Sz@+<`VtuEs$ZM=a#F}@cd4F6$*II(zQ;0x%g47s=dG|XZES2T
zRAa(+M&aeWM@NFqPw(8zb14Y?r}I+xhLu}T@G9!P%6tIc)q9<AhfP2<LJiJWd>6E8
z6y7Kt<=(t-(|mK=uLWNQnKu<>PcT0tP2t&m=T)bDNKJvTq4!#lD#UhtNi4$r3BRCV
z6-0tTF?2)Blg29bCz@w_NkbO`3;db0&p+oVy4&qY`xMF#08O;1X#t?84!;FfvQw_#
z5dNCM7<5M;IXKAk;m(+_fp@@r^=uLb%IvHc2ZvR$4vD7#C~wc<2+e@%;NTBPIY@8=
z4s6WL+ZK1ietSm(Hw}qFCLg&9XzqprKn2Pk2xKB42!GuAwCn)NEX)w+p-li*;ptC{
z*{`~RZ5?{3-@|2PWpLnu--=wv>+H~MbQ=T<z$XHc+>E?H1O9LqDB?s#MZpb^4ewQ1
z8VNE>XYhstDsy;zYz2n@dGCvtNAsZZa)kZ^z{e(a{P2pB5}!>d0zANQ1PTX}hVPrO
zR2<<#gTj!WvGmzwmJ}LdDMu%DummMQbOkpT*WUJvbM-u3=ihN~p}l=P37QtvtxqiD
znF5|CU2kclL7t%%h!Au8qX75Sjop|S@&j*ZY9k_gpu_ovl5!e_;s>f52?+@{4%Fr=
zgfGaPkVtnp#2^o90K|ZgW9tO|Ub8iS8iM(ivcYsP&{^Si_h)0BM@N?=QID*;6M~`q
zUl9ZL{#Q~;{oQXq(^SbRVqr)X#ZXoMVX*a~3b8c|><Tr1;l_GxiDKfy?FS9dNw!QQ
z@vspm5WUfxJYn);=-ZpLg{~VL07Qz3)xqj`;Blk|W};`$u0C+v-2ibs@O|A~;76T<
zQq>VuRnX_YtcLv#g)fMT{3Ym(zOMr&Y7I(RUu+MLQ)tQrkg$L9^Y0|i0fy@9>k9{s
zpCVl2He;o?Z^iyk2{(?%j;h22x+Paw*#ACv74ew}eOM#wZ;MoSH<j6ceZf=F%so-s
z9-k5uobzI<E!tQAl(gt|s=UkW{jxXy%i+q<F))#;YHD{DJVEsS{@1UE?0Hl?W)a%u
z9jc6*&2?0vqO)ro8=x<|^2$`&8(?7+P^M5bsmSCzJ30p31=Ak-T!`F(i`XztDva`<
zBh|Nh>YLkDK2Nwb;9SJNc9)KaX9rLu`1*;^7fm%&T!lEgXprF>kUc`Hi6bH+268q2
z`c0p^#lv9_wZz`;E~?fe50F0qCo@Buo7;dwyAnXNdh{4|dR@cW34Nt)qsw<L&WvNw
zsJM7qe%Oo;FA1SMMJTJ0fE~;QWT)xrq7F-&(Qn=y)%&$-6gqgE9{E;RYiBhj9?!`I
ze>?4<mvnuJLjJ1zk`c>=>NAD4zf|(!gNC+t0<1vTH<Vm@C(DD`!v#9PcQOeH9l<RO
zTqdvqV^Be)^+E3)Bv$9LBdxEmudYs^O-Q>mQLzD64rIvufR@vBZ-x7*kliEYZ%)et
zFMfUZf!jh|O%3g9NJ?^9jslhgU7U~Oao*SU1BEJ_F_~!#V%PrOpyd(TOejm-6Y}7F
zSyG$ym7Mq)g8m!aMslX6A8KpAzU`&Oqa5Yb7cF=q*?mYtqT15Zc;J54m-e8v6ivsL
z+A-|b9CS8zl0uA$m?RS9gsC0qE{~z@E7)*fYDq}}MeiSP;{pZH-NnIM7?<kd^n@r3
zFp0v`4>P!H8_d5$%3urU1v*Cp6O#|_09wO6AfXT1X15rN{&{QvZzMyEWH`AEo%gD%
z+p&yHtK-^uq0LmGLQJ#odv=tU;K|{R=OrJIsp~;}f~QA;&*g$gMcbSSMkHH*c6R&_
zAjOhILc|iBJy5m+<&p?7YbGXTKrMsy39jB7g0D^5BI$udNfB|`4k2R(0ejjM&`|Jy
zNHKy<BAM4pLsK&@Ffj1_`}b*S^vAC=Mn^|MRa!ra)@Z#Q<%S#<sO_J8{@*{nB85#v
z+wQfo*;8kCgdN+xA2c<cAs$^^?bU+K%@eJyK_oUn^115*DGm#Du{#k8hXRPJG8e>{
z7#afd>qm3*3y_Mz9lo%z@W6G~+Zc{=5hf-~POv6+LbToY5{TmC0p}|yUyhHD0ke;l
zdktmWsMX2A7FwPMPHt!g(q{gDACWh7KZf-b$`74t1dpw}sx@Z7>PYY2hFD(lCPh$&
z#zOwW{H?^8U2XY1es;9h8+vpo;51ZK4JCdtywsJEF@t9C;2aX*yuH2Qpk0`s=Kx<k
zpa4)m_!~pp%zWogK0HjcS<50t*x4FD4dBZnH=!_suCE;AS}${hcOWwWa5f+qxOsS@
zGufJb5JkFzc6%#|kf{;RV{P1WhyBi*IopGGAr-m~yMxl<nEw+6fxajR|6UX;q&5#8
zoEGVm5#T%QZ2M0|wTMMUO=o4jV7l$Wb8n!roU?teXLifK0ZBwlss$NYa2G;!&C-e`
zPQd$fZV3QnczM@4V}Ag?0cRuV&0SnvXvLV`A<idc0iKbQt3NabW`UhO>`QRxyvD7M
zAhWNmtOQJ4&4d(#@tIRG_*<GDxu0K?-8ZNMjBqE3ilOlxnx;HEfy<ughEQ4C(ZRt1
z+(aFnonb{+aRWf{1(4ND(ajiK0Ta=bqO`QFU-f=)?OIROK9`gG{PSBRI;Dv|6Osqi
zOjC2P|Jl(0{l=jB7OZNpo>Z`Zz5O_h_|%>=Ht77;XHFJR-krmH+}w6vCnWb?YAS0Q
zS&Rr1*_~7r>1(Um9>l~z@WRbAi1x_L%+Je1RoSmXD#&A*Tg01Fwc!M~xL-<^Ii@$R
zv!F6F)x_#r8Byna;34;LcZbjJbaG%<xeuU1%UfEayb}}x%<R=pYq&Uf_Mh*=H4Y*b
zP)hlme4tza&pp^C0=wj$g&crqf-(qzoRA{_P-rYlVJia4Myh`f4ZkiP@h_jtSfNE;
z(1HX61Y~D3M?4F4Y1|VAPf6RCyu4M&7XTCzqElTVoMK&%n(yUkotId|rK?=Wab4%v
z|Bblv67r@9lxzV0+d*R)p=20genIeUW7O3L%g-^I*`M_g&<Io%ZwjX0S#vZ>-wQ~;
z5@C+KUzbVpyA%h<u)2B}8k!x1ufErfI;S<v_P_-8WTAOevx@ZM&4Q+0L}x${7T+tO
zH&6{r12zI%(0=8|XSi(PJ(XJybu>0+z$t=@>&0*X`SWM<&u~n~)qK_jSWoTA6KLIg
zIy#=HtH1bp0K9PJ-RO7kh^a+CWo6MTgzy`Hlt%~_3l!NB{qbtsJ3CRhB1s-EF?3|Y
zuLqnIyL@L_Sge9G0$Xt41@gwBxDqkZnehWsj>aL-;sd8oB3RmbX!R@PaY%h6&6}&|
zP^NDzESSD})y?St4JwJ5nHhk2=;okLTnJuA89&$#yu)3C7a&1HBJm*^7q5eC9DZK}
z+#NCPHg%^h)<*KwQsEiKmKUQkh>6AA*hL}Mqot(<^ubWDV*9cX{8OU|Rb6-e4pI&e
zr!}F|J*(V=1mJ;Zg9ytx`M9*JqVuNZ>AX5Wkb(U3_1Q}C!s@D3@`D-(e{yZUbTwJG
zK(|j_YM=Z;e+NPGX^KR2o;qA*=e)!=92^C3PC!~Jl%%r|`v5E+R5Iw&{9SZ3D6$S9
zp|nj8u%_p_x_}&Em0Lr=AyhuLxk!L-;cNk>9=eG)2(Qy4bxBEMP?Z7ySp)Gfc0@?`
z!+ZI{LPFJ0a6or8UTi1{UWDOCkm=*-w!1X_`}ZAo_SM<h%h4uHZlJlYkOIUD%)Vn2
z6VxNT+r^N2n8THXR-(dgg-*a4>>_DtY1!Fk7S6xzyake=i@xOsr?A8dpv{nLP`_2E
zU5<;w3xlZ81>hHmmEq%{Eqjq3N4s(g3Q(j&O)WUgpsKkK<q%x_yKr~|vJSZH_2hV1
z&9@|Ex1oduL7D#@c!Qku^oC%4hZ~G*O=k&?4HnL8<?-=ja6%I^?{Ee}hmtM}dQMdp
zP<SKYN&rL%U{KX<$e!Kb-*2sL!yGoCXJRs4>Pu5+MLnknQ~W_oNny4_me@bYGMX7g
zcO?}+L8cCANag1J{_B&9r>7}vqtoltCCvi^x(-B1S?*g4wzz9+&!nVl6NN`O`T3Pq
zReQ$A6&z^u+hQB1W^P=&W^Q8QUt6of5)&QW+0!GPLh)S3(Q)sVYR0pb>1m%r9$)qJ
z%Dz~sltlJRGijeb1&X>>F^EA713Jp?O#kv$V8OJJxP4Mol%b`iD@0R4B{iE=QBe_;
z+;Gzl=ldyLJQN>(_>u*1z<h&dVDtbgEQo~!bUy@-YyeabIL7)VMpq>6ZA`Nh`qO+O
zs0z5rAhmP^&+S>EE{;XeOG#(IBe8-Wk%BzmhloL<jZcAE$Q58J-5<~_gVyfkbbm@x
zLL&7O7~Q=a8+*52-GvfD>(*n1aS*IePM$&&HF`zx(OjRWyBLI28Gt2RKo6S-98_u=
z8ekqkmA(i|NdfJ17Brc*ws$`qx}QPkQK<JVl;*3=6sYH>VRLP+{rssbEG!J>%;2)b
z#EBTzr++r4AhS;_Lo+f9sG4Bqu{Kg-;y!RuLiiWdnlvLIJNW?WuQ@|_3SgE#*_@Vy
zzdVS5I&h$c|K>ZtQOQ}bAAwkC2E1-f-iY+{^y1HK%{AX-4!$VIYZl*E+d8gN9b$Y8
zotLac0m;S$?SPxhBis2GxL^F0TliLU3efRsXB<D&wluDbAQIlZd6T_(vH!XCp{U2p
zp&v25DO26kCZSjIA`!}Q#Yb?B*89}}d|O`)dse%}4GB08SXBE7z-9SJhU|gkZiCn9
z-ZKCKd&HPufm9W%dU=Vn@k7Z+RdZ9a>SC#nTpXg=<VU+1(%}*>u@50hbJx6IhIy;d
zNEUfMQasb`?l0ZIW*8EQf}Wu90;q&~NV<Owfl*?#6yn80Lp$Kch?hHqO9m7!b#PIS
zj%u>p=M(}<N7FFurhzB!&{Tnva$~XvSkqvH>!06LWMqEkg6YT8K;TLZXo>;ac@3Sz
zLP9-97JHX3D?7XKu3b5Lw|6J-R^CnP2Fma%*BPyw*(GJEebLS7>cV7l4mKPu?I@HQ
zo5Vn&8jCi$R4`?h$I#;tS9!cw<_r{B@A)6!?l+RleDde_#J7?2DxR&K4{Q1#kRLa)
zN8C&ox&+M@Q1OCO!aTh=DyphJq^Tx)dMKc(MzqSavTo;J(o>97orlanf<<B=Z;5UC
zo)=r$z8s~8HX#r{b+?#8r3TJDsLYd003OELfg=EHeSoGiGBCgqKnn|I)WY@;Ha`BF
zurR+b7{BksBawZ$0KGk&vHGCP?TNy!9|iF9N{w#<h7{25T)!n%_CX2*bv~R?@vaFM
z6sYfgr}_+C2t4aZ(3OpYrs$6Ej_qogvy;W`XfjRT)Isy)!-Sm&?ogs-2AK2DoYtn!
zsOWyBBXTB*{^nZjAEUD4P)m}jdM3NIP?(>u@|35%IL_M2N}N~<gTBnTHI4n*x|Ed5
zpt(r7wFbWw631%w`Bl!JkPp)|QZ&~4@teodk8FLh!<3)LtuozE#b3H%y;30W>#pbk
zjYeN~&`so*FFd>Y5bcWq!5uuPmmqh6jTMKJY_UWp=}cLD+-Drfi2!~4?nh{Ch&+Y?
zcG_F&pABN7yn0n~WDGJHp;xUE)ga$s@|~F+_(5q?dCTv$Uq34wTdnIJ?wjj`#E`k^
zD}oDEAch3?C7c<8Ao?udRCj~s&8rdU$YTj@35oO;#??6}@@?-wa6eFymBo%gL$BYU
z-BVK|rxY*wwF7NCNQL_Dsu)G;-F3;m2%R4nE32xyde%p$i{C+*3HKJ>tH1?uK4#`@
zsIMTsSSU}g)I<&fx<E*Pvn^#U;DdpM)kse?Ly&#NcI4t^Ou(UrMn~i5ysT7!6#Db$
zCWPa3VI>EKV!x)71gsA3iH6UZ!_aAv!?NEY5)(Uz!w|qn5DsH*<9jL%U0P&k$yC|`
zUY)ya7&?{&e^=nj&|+gh1QfM2UlZt>VIAD^J&C%MfSX;<M`DA1*^~X&P1NG!p<+*v
zRjIRixkAooCn1D{8kq<@xW2hqmS~A>tgXk-WsrhdU^bMrZ)$cUc~n(fn^juc<#U<c
zYoLwxZ%la~2>>EN5u#;g^}bj72KP2Mv9Yl$Jx_&9gcV6CD6S+MY~re5b8;2}d=d^o
z)%wq>KF7d#K<E0#t`Nit*riuh3GP~1$?&CntN&8CGQ>OE8agxiNCl~rFh0|5`bE6T
z=4T~@rzuZvq5WGg5&@UGat<^WUodSj|G~`{?`(Ay68Q*D&|3e<^LjrlTSmzK%Ga%W
z$9BG$Ef1<=$QtyjSDO~N!DEV#U=ZNDgNu*vZ_EQ+@2y)#JxPMVAi}1bDzg}bgh<2B
zfX>}Vr!T7s8Z_8$`rfp%u<!}+-EHUs@i<%(4fM)Cc+It5Z3P<BDqZ27d2ZiI-NUAW
zEgs4r11vVxN<;++_~YZL#HD$Ik^u|24G=6)`w%ZeNFzYJ0MU%BA|f;iU@`{Tb({3i
zC1G6L&rgvc62u|!F3*u09R+I##2%c{p#YMD;5<KkXh3~WH`dpc*4RfWSWtk7$dZX~
zNk>W_Ff>yWC*OM^u0;hLg1yOI`=elz!TCAYql3y*VXxU4f^Ty+Q=yEEuJ4@dEL>gf
zv4}oQ)m(L56MI98ibVJ^K730iH1*@VSRZxZO^zhJuhx^e$*-5FrjRaAwY86+g$JaE
zje{fLBd5bBaZ=Esr)8uhB(OnAVqkC~c^m-Ap+X7`J6l`tlhj+?2PNqUeiHJH4+}vu
z-P7im#y;MAfy=TAKSulnB5(~wgYQ@J3kdE-@Gs2+@ktLdur`Cc4=+It_W0W@K`zcM
z+?#kur=ax!jouvy9KpSyL_$Hq3&H9FM@Rb`%J7Gx4Mt!q0t2O?u70HiqY=;=Qveic
zkhTVBAe8b72FB?4xWsc%i_leo)}apxxg&5<n_s()%F`(DeHsdxY?VvEy@;o=vVYP<
zBJ7?#Ayre1e5`O9)A!BIIZ|ik23$DK=TJ*T`ozXw#X!J&F!%!WWidA2CfsGV8a_s7
z-enmMj|O*^R;V|(8`q&>^gB^$0BK+9t(+@PvB0nVMhQtuGz|NkK)vDu(KWV5z5^Qh
znit=@y13Lcfyl+D3m|Li+`wu&vEKlYR^JeHzcx_J517*>rHC0cAA0`^xebPeznD6b
zYI^#uS(oOEm-$7xxd(IY;57}ZugAh7SUKO2X`=S}fy~TwXb!#nlFw?_yxgo8l_W;{
zdNHCE5PoWpAIPt0osch2P(XnART|1Y;C08wsN4<O-`(z$3Hr#ro0%r>3Ue5Md4`}r
z%CHyK)`U(MpIm$nk!T3WZy-3D>YFe4OnS82H<b(=CEQzy`^HjR*0)$t9NloFHMlSJ
z(cgW-mRkjr2%t`v6`*+vzzCos7am~lU;d1_tO%irDk>Lx{WFN>Y(X&qG7zYdF}bcZ
zUy%$nCaYCST`Jcfyucke1j1(l*bl&xF*7s!aXh_9o{C|9fnq3Wz!Q!VXgD7707?w<
z)9-?SNGnDfI}|*x(NI<<(nx}F2`MQszr^+zPrSnBBG9-3e*j*p0$XxIE)ZY!svrfz
zc@i=m8T6?^8ZHUz(c+8alJgg_)<=qHlN7$^)NewN6>MSfXTS!qhsz^Q#BqiE&ZUe%
ziHGm&BMqfSDS&<qO16hC+no9Sh3iCA1{nr~dnRYS-*1hYK)Ctn2dHF5apBS)m4@?i
zbO+u)NKnofQ@mKV0x26fvu<V*;o|xx6;rz)5GqJu7$T+r+0aj$sZ)ajq}C=Dceac_
zl{t)*TU|Zd8;MM@e#$?uYOjrMP?MRbka85HGtLf4RtBl38+}VtdB@^NsH5<nt1ra?
z5(4bNlMFH+o`=lf$Rn;^Hn^0E0gKo#g3J8!?hElcLyEh&kQ==u2N^7|E-xRA0nCA>
z_y-0qd2R0P!HcdQfgkdVv@hNo^_HT!CL|S6ai1;)WW97by!N4CVbcS57#SZaQc_UV
z>qFoR+mQiSHfikO+XJXyj9sd_WB49uw}hxH`Nxmx2*xb{+JhAi_rOqLOlYc7Mt_bS
z(vURtl21OB$Vt!u9nYu$#W*pclZx}ji#`d=qn#Zc*YGii@HH`TG%*HhZKox25^xD+
zF3P8wu!JV~v(XV8(q9Z15s4)n62E@^@?ly|LUEW{VwyS_=OXL=xU+mUy;`DqXNAMT
zQZkj8Wesk3SXVhz#x&cq^72n#`EmHljYdaOX=M6}H)yNjlvfb0WHmfEQ;`SD8e~|&
zhYk3<1%F_L^84OzpuK~-;CF&W)zoINAQp7XCKPeG@2x+fGu0@s#oHH-^PeS=jt&kF
zk*w^PWOuo_eQLQ7+tBhtx$g!Y3RwA|FL)#)1FBBQRCfj=CjbqwK>$^wBch|GTZE`Y
zwDFN}6`(c{y8?;uij9tE*BNeqOro0#lZIA$D#%3aP-pp!qN0?zICC>ItRQ8NM*R^c
z(1evrNlOoVx*|nEmyN~{3JP?*YT&j7^r94cis7Oov+t46YQ;xJs_YnF>{=cvngU$g
ztIn21kdBTHDvTb}e{L2S>cJq(P?m;BOk4#cJjo+Z<16+fMedh;d}2Q-+UMuG5=xyp
zh@D{NOvvmdlR=rGv@|xDDvg8-6-Z(z^Y!dg*tobh08s>z6aXanJm6+!y>llskP4*Q
z$*?JRACJBK16?j;YJ!~Z@~VcK8h{_()lZ$j8Ia`b*yrKq-+Y``wKwv{8&RZV_7!9+
zm|i31<l?j`PZt&v!5tMTrdRBl!6NN`Vt@NK1NlZoARtmp+%z`;cS^mL#H(O!UHI`M
z3Lu=%EHeZ<_9o>TmwSl1c@L&&3Ee5-&WGk3rrJao7u)LaV*!bMDEW5>nzL_xh>ZL_
zDC55VQx34n^S4501YBGoW~`{Kt@vtPZgLkP?&?*=ay|S*R1}~KRoAzyM>coxaz3|H
zQ!PU_==p2<VhDZH*9DR92q!fs=Zv#bwgGzaJ#Zf-YNm$2OPqMF?(eFS&<Sw{Srq6M
zZ_#}>xaT#ARz_;wF@14c{-O8VW~0-dX;4~R@&0ti-=y>CQCV@ZcS=e<{PJ6ppYPu*
zFnw`x;;MJ4uDt|5cHI9lCA)B)Y~)+|wpRJ$eHL>sI<x54bsz4Az{D|Z57Q~7SCcg8
z^3QHO91(kOj5dr7QCSuDo6W*dC__jT*SGBctJ|pb^yG!!5n)7MLA_a88cXepOn2w1
z>$@beyPMl(7KB4@6TYl)1&M#?``$RPtDHg#17J3%h;6o2cU%C>I`4Ky0%Y>a^lM4(
z%GyTV><wDlg3SA5e?MQ`4+o0XQ!j!Ts-M|4&6S2L+uz2kr3lhI4=2C8A@$9<KB6<%
z!x;b7nh<mu0|IN(NCuwAOjy~WJN>UaH20Z0@z=YihB+jJa9`cfeWm<h17Y9t6T}ZY
zo0SJ=r)I3IA%{E6oo&0fruqs!RPYXtFt1S;v2zl_<d4G6t~8o;Fz(E@UZ%Vg_o&4w
zxA&V1M5vIHn*zIF0@=K3zm2vdgqXdOx$Hss2g-FP;6KZz4zlojR3St_#dD`NR~!D`
zS)1BfMa*Xts_J`h$Ty~3TLGhzRwVGtb0mib4ejUG(;FKm5dX#4c#}_U=!a?6!Y8s5
z2W@dVEq?B4DWR1K#g+=k!Ic+k54XSMKj9O9v%z}p{zde{$h^^K-E?f|+CR$Pe(6Ny
zc2?Kom*dZmVMi7`(g}0+Q0=w|2#|XEX;fXKw5%+UiOKUC-%Ia|j6>!1kpltL=2XY~
z!sGoE#mCjRm`5tkF1(z|NPvziEHso;t2DE+GVZ!-HbDQW#A2-hpqYf015>q#>cP@E
zp5_wFg8{wqHT*yTs6aF7>9Tl!L7LYFeZM#P<6zR$*K2F%!W@P=eQo<q58J{&1MKFU
z=J&^;)02}+sq;K+;9|SEb2iu<ERcfLpU?KkFwA17d}zGd$1#H{^5lL^%=5qJ>3}tK
znm0b^i7?S=#iTIT#ZbLsEb^TRFQfDVVYi_0My)+_$AlZ4!iE0+mG7R2`R1IkGPz{T
zbQ7GtH2yL%!oNGjuCFy#YeMh1{TvCkqmb>dhX==yNSpt>W*G9~fVl+3advSLuoq={
zdC1`ZQdudGZwqoQC|E&l_$DZb`N4x5CrS5giI)L-fb=Md*w}8<M(twGd=@%76}P(|
zL1G7a!&V(Jtnk3XLqg)Aoq{6+6Ws52XxT%Zp#Zn02SgALR}@4%-$kt_6Y|<rFXL>>
zF6Mr0Cz$%aazE^3ufSuMAmMUtTU#XP0qW&KuKQjTaF|WuHIu}kLWZnI%ryAEnrsky
zxFt=DVd0#L;P>fG33-E`-edN{aHGEN;{zPa&7l<g(lLMU3l~YBOfBVKx$?-AN%e_D
zu_lI?Sa!~lw36>jLBU_QlD3<h1O(3ymblId(EYjyt*DW|mbP|aLV{5&rx65G6loe7
z8S}h#SFYhxq-ss1B-?QszaKPF@!)7L+DZ*+h1$a0f_56NB}x(!pLUR7YHDla;^9HY
z9Z*K*Aj$>F`K2w-tr@8&Pu}_Xh(n4QOIB>`NECw%+M9m>YH&!12AHfC*k5m32|i72
zo>FU7)u`Ou6{xqN<xh-@%gM`ggP(Bs#Pm>8_3wR#mzI$co19EaW?zFdqnNxhHe|u3
z%5y;bY)`+?%j>dLk$QAWG{8%;vdhci`J(iF@M@kv?+pr)k$>#;wLJNOX;<--n@ze$
z{Z-P}&J7qf&^VfuIP%@VbBliM79G%IaQM~Q@CIxmY~M0rxI@3cp=x^1PVi!L`xYg~
z4C9+O!Sb;yZFdoOxy=X-jdwECrYgm01RjdGK8ktP`L2-D@P*X1>u2oL=2byh(`$+l
z%;!O9jx6CWd4bpZJ&JPSExFA)$mG=3pK6WW6c_)9K@XQVul){3NJy-IfYjOKp>-%?
zaCUZTc~$4!>u=u-=m8Xlv5}E1931Kpr#v_a;GZU74PEC!O%THEKGD#S4nQejH(Og<
z#p@tf0@f429q@~#W#h#|F$Wf#Tp&H!+3}VW*dB<8ynOi*B3-wk+=l|_U_$r5i!^}=
zwV9QEm3;2*H6*^YnE(ZZzVTd>nBxyuS5<BA>{R}AsF{C>DZ1MrSnYhbg@wJdDBlDJ
zPoDIV#a?XmmApZ7NK}=_!jGJ+=+%4p&|7<QaOZaj*+u!-9)RerpA58C3Wz~8)6}tO
z1;Yxljloz+o{YrJ8jggQ{B~+O6~%;qE8OuLEU0i*?Z@Wk#E{UOcu|Pc21PtDK;v>#
zG$}abGk^9cCX(xq?_Qf?76(AGP<U}`)<2+m-$8@2b&*yypCK`q&2FEw+F5=_CX|2i
zD}`*r`yqV1uqzm<f7YM-e*IH=y#KbXqvPcG_8L>jej<MeCT6FX4Dw59DJ)J9Ykbc!
zjC%8?3#uzB{1#w^Yd{SKZ6rvX)3q}pAb^lo6yJUw^8G<VgvRvU_LXF1&EX1wY7HDC
zjOzN3T&tFun`>!eLVoosT1EqSn_|NK&szV3K1tjcN5TPpg<Yev*6aLpWu?iHLqPyu
zHZU3{-Pd_wXb4sifs2b23>E3?*UHQpU7W0ep*o-QG`h>b^7-sEM%<7o(kH@d1^!k9
z(bqpPaGQq*3zvlW7e9Z>eXlc9%`%hrHxJx~Ngr{~aj~&|>PH+LHV+q0!ic-oi3+0V
z68s67{a+QIb&#(juv{@@Ds>b}xUY>3bL3x!Hc~v&2`kjun30i~LFV<xElkLCC#MMM
z(AI7+A^qk`a=dRpu(d@~-qW*?IWz>3Pdx*r_hx!-^z6)$M?^H$dL%W8of8|kl9~vZ
z8?_Nj=p?dmGI8pk)mA6bM8)iSybr(&d753DJ@f;6i4y6l2r62So)EGmd|CNzcx80p
zQauYIN;y2<fCc)qczUTZs5c|s9VdNeYH}W4+3HBh(b2H;I)aU*?Yyo@2V5T@{iZ`f
z2L?&}=vaX^o;xd(HT$5JoZc;kP68mhR{)_+O?mwzK)=$BwuAu24-u14N1dLW0N)Gx
z2DG<?r2tw*!00ko&@UdW_#nGBAW;BJ1|VDrD-G-3{ttRVnT|8VQH|KjU0UMrP0@$h
z5xMbWpML&y1BP5c@P?@E_xk?+p+@8Kk`k#3>yDL`Q$JtA^)Y+b?gYvB_%vWQ{5MQ2
zE&Cu{0q$Z7>Q(36rTM|^q5yvyN;0w!QOni_22(})MIOh$1N<#}aCSk)C1{@nIVQj%
zxcwgO&x~{?UZJ#Eo%Q)hhYHIcRrq^7h$mbm-)O-mxV^huo1G#;e;rXrj-m5x-|eWI
zIs!40&+91?r5u5MB~J|T-6eKnG;VWmzKi?rFA-a%>9y_>b3W>ru;2(LlNTXPk1pXa
zPVn~(BnyWOfkg{Z=PtLjN~T};+O#K5b-!^<c4sR)l;7FKh03AUW-{~__-AGhy5bAi
z*iOAd<xu2X3ogfe7UTC@Q|T&z`g&`r$AAK-$xSg&ks+M`o{-O<Sr5Jd&qmh{8%*h=
zhzn3#pv(XOO&2O%xC)?20qj303DD{Ah=}37zTl7$nCM0_T3%UM3DX}`RY?v+DE!u-
zg@A50(glTd0peA=#tAK44GEF^;-1ffQp9Mk2{2bcPOo%*c%Lj}+#(<^FUi2bEG4y!
zcU2c#qVLBK!d*yv-J_(GrdxddI)RYz6UYT23QtXy`<ty{Yn45JbhOx1^}MLn@c7T4
zCqYDS$3}lE%F9Ez8;jQK<{zI4J38*wI0;794XSRElg~|kqjua`$Y^V;#3G7#XGkdA
zW`+8vX|nR3nY?Cg-4sfZgu%}r^+EdNzQMq`fJ1Qcmsv&(jNtu^5bY70o(DU@*h&&g
z5z*d+pl&3&Dx;=GJW+A3JW}+W;gjO*Bdb+Za>h>y2^xwNv6L^^3$Rri8j>7b1hc;{
z_6jbCkcF&{=<N6{d7pfqQH-pQ&Mht74h?<my1)K7V{3Rw748{(Oky^!B4|=7E3XHS
z?q?uco0|n8vct{osdFaGTH&|;iCx>>+FAhw3gB1pWO-OvmcV}m0%L=59@8!~BBrT1
z3Xmn-v){qhjUxC9RYEZZ^2IjTYTTL?UgsNn`)F=AIx0#$dl2?p{rAMx|G|SE8AEW8
zk$OjmlJULx!ECG5Wr7qDEoxKGgGfu0GA3^BS6fDn-`t)xxLj&-YsNLUTFu%(HKRux
zqP{Wc>G_~gBUd?-?ztaau-|FFpHQ5YwF8^U1|_1Pw;LbLER0We3(E7*6)clVC}IJd
z_Vxu!{8uw&zgR9V{Og&z=Z#$F-2dPz!xx%s#5M0UH2H7tFJ`K%3-$)>?2IH1{)pM!
zna$+o7342rm1AmaTO1f@YJHnjRxM<AE=sRCGRjCrl{7K(_NLtc9wqS)tHM}LJ7hvo
z(9`3AEDYWbOAX4!@|`&oCD+5(5t|Gg?Env8P@s@P33kcDBame=ZQVufMZV!g@SjpK
zVKb-d*G#h#5v?>9X7#}GPi}5GfGuEj5zIwP^Qp%s2)|NWTZ2!q_*gzqA@1(>lIr4o
zr!y9Yh^!(3=!DiwUvS29uE7Aeh~VI%eyXppvveLmK1vo&5=H^!H=NHaC^)5*G|I>^
z8Qww_DB0Fv6rO2f_I$FgP@f2lJj!x7FeH;5wRINV?AA2oto)_p*PF{T9)lPDX*^&7
z(=|6^?0OFl2rSD*2N&qz2qo{Z2#BxT=jUIvuvJpiytgSZm#OznukPdM*EKhsNSEH5
zMr-36(SQE<=dIS|B93D0_jP20cNUyXnOaN4b3sw(*_f0YmgkzI&NjWSG&=5?Vt@SS
z+&YedLy|hQqN?g+O^ub`%p}XDix*=*DLVtUphVnIHxj*YdLozxuk-v3qUgERSL4><
zDKeY?+xwq<Mexkbrm(Y8r6HV&6RzpL+Oymb4vtMvZ}{+Gu*Nx7vua>(p<8mYTCS#&
zCL%;`ckjU+uA=V7#=-JD0bP2hwbJ1}6+d3*H6d6x9OB|Ai@_ElL5|Uy5%3^RHu_bQ
z+64qk-_ft_9YvgcQsgrscW_nv5w<p=+|ru;_^}h5MAJ<n2{)KB1^bnnCw`fBKqI%*
z5FSnXXJ~D5xEtN#U(cQl-v6d7HOpByJCmXQ{$7Rx`Os}0DWrH+Ru**!F||BCO`Dwi
ziR^A+?RIx!Yt&-ty@(`m)zrz<A07m^z7>=V8^VMcEz@q=D#X?DNr|Hb#8mAe9U9`b
zas00L?Kbo#FTY{vmeC2R@b}Lwb1<{tk-L5SMV&4KfG=?0fDXF+RU(C7eW&TUs~Ifa
zKlAc1W5w)@dZxRV>#jo%GAc^XV!3ETZ#2u$kPA5OjTxiH?(S3JnpmYpImOeN_4S~c
zNpbUSTO(f#Rg6~JcvBMp3xe&8WIt`6M@0`hyiTw&<i@Lgn<*6={q{TdJEBlYKE2n3
zwDDDyLff}^+jDQa;iopTQBcGqku!7Jb)G6^lH*y3x-M3Z{JE1zwibD6I#lT8e6($$
z?r{ytMvQN$pPiP3!+QGTKe_azPkfODs(C52W+!V$k?-dp-CLwT&Nw@3`aMQK(cIh|
z^03yR{_T$QZ#rFFfspiDJsb{8wzL8&8;UP&Y$85feaF9~rY^|e0!Ui;X<vy2`_?){
z#{nO7aB%I;aM3EwC|y{y(Y^SS6XYGrE#X#S0BqeQJ3H9m331%I1vFA0O-;qSb`w18
zP44*oDZxbOnLnDbauq$?TBp0Oh-9r?p&gS)I&VxnIkiMY$ji~&9ECa#EFCkQ%fa_V
z+e_9WoET6h8$Nl34)UdzL;L-82Zw?_O;<326NI(jzfY?FmXuVhT&Sn`ds4@lK3SNI
zjjgFA97C|yZnCM_PLIiJFEko6!*e&--t7b{wQ#bgF1z*3v@D|0c{a*4y`V`)tt`5#
zYCTP|a8Cq}An1aa3t4_(X|*r?R`lD1BG^!v&^yX>ALfP~Q606u#rfu@{IVKxm68%-
zgcV_U55``svo@Z%sHNyz_aDe5{!kJ+THYyL^}Pfn^P$cL0u-+h52VF{x`9Ew;p+>s
z5fVk>4NPJY2lI1?{yE;KRb_u9xP9)+b?=@@Q?S%O*J0Q|L{kmNHQ^74gUIM>qDeb%
zoG+VO*tOtXzJFh!(RBp>CkNvr*~hPa<42I$1j0$B&MG8@+G4aASoVRG8L~=eBL%;^
z6pY61^{`BlYiJ~STROPvY$QiW=wLmYcse4C(!HaRmA85r6T{^o6dn`=^|FN__p3g-
ztdt)KQzQJ}M-k8M>?*w<W^5n;e5cu+myZY!mwOIZXvhXRIerS%&`_FyYwm8m!u4<s
zEiGUIVE!FDC+ArsE{I-1%m==kADaes59Q?AL0<q3LbbzksYyF&&VH=(8Ic-%kWrfY
zA*aYXs}pP6@};F%OG|rIq%ii;5gAZ<RQ%QmJG<qL7?^shsag+8rSoJ7p?kKZojqsZ
z33Ef2<HVU#;#-9&C+<dlsT)y4UELvr+0-3^CfHF49dbo_r*+2zj-_Tqb}QsERpD=f
zIdsnW#cuiX$X=6rLfSgS`cZ@)+E;%)hPDa_k^AM(FQXmxm2yKUBcU;>6LibY3JSiF
z#9pqhe-imw#l;JwnJi8R1W@E}f`fCmW?vD~*y<t?LV}Ay(T)2#3o!_o2D1s23(S`Q
z%|xG0`Ij%zk&)gC2D2g}Q@6W^hF~gFG$>3Rgp!+$`YoFoE<DdZKW=5<OFV51-wF-I
z1uzXLdN_490d$>aeE04hl+;FQJ7`{B0z9CQPMl+=+%D?jdN;n{x>(Xt`+iWE*V>ei
zuv(qB^eo_<UdMbC=-UXxLRAI1x;QVdw0my_wU;JI&@7UdymR8xFj|>r^sI31Cn**t
z;atAv;Mhf+DP(-~1&hN?pQV1kXiTY!l4*YhtW++~b(+@AsNs2N&eMfFNiUPX?x6`w
zN;=W05RtPxtv^j*K{;&ds1Lb(F<W(5VKg#IFG{65j&3{`NV?XBHd}x+>9CET0RRx7
zjUa0UruaB*pkO|r9jLJYNBV}?O3%q@8y!W{gHWY{w^>O^3G{}3640PHgX|Wda5fee
z$N*D<dJ05`(!$V&8gTTvK^+xG0~w-f6;J^G7YFIAZ*F#6==xA)&(U6NR)Di-p)u@g
z5f+BuD9zwyZ5$OQq%?JbmRAPu7VXRA8%I^{@o|DXZbJ9o?UcDZeSNkWbI|((<K^Xf
z%+FuT=YO`=I)+83V(923!iu>T^M!}GO;D;x4<@CO;o?5BSl(wIn&#>{S)SI*96Nm4
zoak$_zn;+=@jN70V6r@KPB~WrPB_690hiL}hK9@aei4CzdA@!`e^!}Cau8sWwt=D^
z5=<cWOo7QBjH7hZ2IH`20JI1X4&L0^5%xH`&j^GvTE7KJ_cJp_P$j||fw@WGv4o}o
z%1crWU8uG3Ng<fg)+Sd}#STtjVn<6-jo;9RgY_zL{(nkJtl*05#e^*RZk+VY`ZqtK
z>3&q?G3i~O?@}MKuY{u1bvU=Zx7~fQ$GCELbtM8|BHP#)a$3>fAXjC$c+nFa4^Kat
zJI0A4aqtTRfmkggWI^qHv76nWt_<)G&d(Q0PaZ0^ATtZL@%=HI$>>{eo-_JWS4oX;
zrOE+UD48s`fXQQ#aPjQ0+VT9$`qAdzE5;b10h*Kg8O7d?j>vZj5PxT4$X)dDWq2rm
zws5Ui-P@qnEa#0&ukGtT=Os?ouF-Q%?VsQNPdf{In~z{|4hXLSG{-6e_}LE<En(Cg
z$cX6uVBj`%q+o*ra1sKF6l}L)XC)>kLgEqpk6>+o*n0IxaxNEK_n`7;*U0Z18Ui;Y
z(2P|uW)!rE|M$VkkTp@)!knFS8lmLadg7v7wwxS;d)F?_`E(WL<jl>_GuJt~zjRW4
zar+MG=bQ9zR5d>6YqgV7eqan9Sm9#EBYXfP6-UAC%b0Cz6J}g=bPo&G=jT^{*&1|G
z<`oxT_3<GR^+dAX9{N^DyL}e@7iFU>xESjEfSjCowuPkkquqdzP*R)OP*K5|tf#2i
zZc(-}b+4ABE=`H;^nQaCk2&!of$b(G%Tq}|#PQcOiE78$7`od-!)_~ey(w%cx-JpG
zQy$8IiJ8pZ{bWy5KVwGm_(Txo6JzlGb8~Uy;}4sgZ%0KjgYjhNoOA>1HQ;zEUIl$;
z!!2;ks#pspTfcfmVE+&aDIacoD~>?OZ7x#rn0lK+MzAcTW=~B`4G;5)it0i1@nb&`
zFhTRGw=^|XR#nwofUbNQ&I7b~;L^XItMQB?I~$vWt*yuYy0M{QdsqB;7^Nch16Qbw
zF7fbudC$&2H%I<as9aLHS3+D|h?SK?RJ6Ojy{p5LFu&4jq~B`f=4(4Uq8qt$3ttC1
zEP<ZagWHvscFOlk^kVN+L$~4`t$W5BD95>{>pww6az{hG!us^dQ~mZf1<$<>NHPnk
zmX<qE2J9TYo-tuPZ>~`<M(oFOCAswn2KvWq@yz8D{Qi^oETs7EEe1b>dxB{{ehh`Q
zN7uEo^>#AQ+z@1N8#H$;(o?N>kxFu9rm-+QQ`=gAgL<i7a%#jmHmbjAWkOeM5R&3&
zW(IZ^{nGT|n0}vdbo=gRz1TUTy`9&y^%x`(FpaJcUu7NmW0>n@Zf>3=V0Vee4Vp_L
zIfHFjj|M-08iC3yA~Y2KOf?Li0sI&M%*sjxh^;E`$3eJE^27f6X^04s9cncyNiBp1
z)M<6Ndtw6ZwE*+*3G%-<+w1#GsC4-c`4CF87k2eoZ*mR(*Q8kIjo)L5tHUlg1%6$S
z0HvlS8CjV1WX#aeG1ir;HIxTPKp*h%+`o0psr}vB=g;D_Vy56(V&@y)fH;e6Wql$X
zyQ^emD}yqY1B`BhReHz!MZ~0}xh~^LC#@A$BZgOFpITV(Ka!FX?iyTJdxDe{qn!W1
zlXiJT(VOpU(0^D&izKM`)E}we&I?TpGk8Yg&?BJh^|SbUo7a2La2!5=u1xd5mG$0s
zStTO-sHRQPr8O~(rj|N_^{B?#4qDrvNEa39PtF-L1bPo3f(*nLxfMr1BZF%Zg7$$U
zg?tQDN5_f)P8(m_96*&Y&Hhq#F&JG{%xxV(QUUQ;;9r&KiHnPa_fk_?iPprzoenC0
zrZ#~^@Q1*(<!SJ%Oi#mz`qYRBC3}zmfGP9lD*pjfEKM?vU+^~e*}g|2%j(3E2r$4N
z+mk{h==2hjs{VzhXno3N&dG^`+15M1$EaCZm($Wd@|foLv>#GZI{j=hwE-2;-l|Ny
z-#7OaHLMpe;-A;hSQ4Ce3*zY3ICXYdE}otKX=!;XG?4QPo+y^S!kccEfZF~zlmyYI
zj<1gPaMwk)SO8yiRxZnaS1L)olhs%>k+o<bpsqm2-+>&xTmkMdC>gzU$(^8u*80fS
z$Rv97CCi7Dx}IQ6u~7ab^xX{c|CPCZJ5PsUuz&(eZuVzspy0_#QU<TEu#UI?FJwwo
zN>mg*gxs^RV7mYtL?=#xJqG$5E6kgIf>Hn|Eu$@yS^TS`d;j20=tothmH$|4%tC?5
z>d1lPMzZ14?3M3ebQJSeVrI<L(xL(^nY4$D4)y0x3m>06j2axPJOAyrAAxYW0s?c7
zvlCWesp7=EZV|oB`QH%^+=0T+-(Mnus`s(kj=0ge(3>K2ll;b<6F+pFIy`iT7EY`+
zq1UFK-Zl4C_=m;*yZ(p<!&>wdDd%QcIbVpTOe(7bJ#2A%Tm=a>-o@pry0c%FVy)a~
zRA=(Bh1aMGKIgl4H4auHqHRZ>GCjC``?JjyEAPx?;l(YzZ{t6jU+ma~?zcyahEcvM
z$aOtDp1E;#->8qy7&v!H<F^TT+L{7DO2P;fZ>8+kCPs5}{IgXqd%)s6Ji>wXIyziS
z5cPmK1U8rvxV`J3SNroe-7nbuea&HMe0*+&x?|a?eaHP7v9m1%I#mN45U+86m!<yD
zE}YkXVFih7AY<l}m3L<Fnj;;p*w}Df6*21`C@GHnkfM*BhmC{d6i3qhAFL;J5_0Wb
zx8l4_R6ReQJTWbcLgI$<4``RYM}&ns4GgJ|)lH?P{u*gLm|rvlEV37lgnQdD!NGK4
zd#k2xot;qe@;T;8nwZROVoHx#F?%Bt<CkBdUaD(qZg&P?V{aZF{2Z;`JLrt9y?^~`
z;^}JkQ$4*1t?u|Q>yQzBdVDUf9X2!h`u=s@f>b(~SS1-1Wd}p(Q=5P>i-^FoMv07m
ztLf@VUN0$;tE~;1pD(w#|MBL7YH!tz(GvGLSzb-32#v3D`eWm6E63k+TGQTvk*G+o
z6wkRf?c8ylid(;b|G7r3JX+Qsh)+achA0_?jt&rQU9EN)?nFeeN2kxXOvnFC^{=Q1
zz%Od?>uJy?YH1z5YK&T&2nLTYKGnU<0-dcIkikGfQSQjTLdVUms;5^AzbFRE%ZN_|
z%uKSN@vWEqRPTSKO^zx4m%k)L!2Z+lhtXX_H~d5m*zdU8+4Ibvh|`4T*%&>I&es;F
z3TR8>73-pKxVFmlpLc>Lw$UHq7&xy!lFXs2W3?RKgIE0ORp30caOgsceFWE>iK*$C
zPt7wJR-brGgnKgt9-=n_JVKt+5_nHSs5@lO3U-VpwN$r?DA*P5a1_YW+stVH82Dqi
zXXA2kQv`cxpi}})qKa#i{}s%ACAGmgBzw4JEm&D;zOZl<&HR=Xda!a*|9~s6UhP|0
z<xyXIbBF7^SwctvQ~}B<wWykXd=UD8@F4xK0tWtdg9@~ak}4|2nR6qO_;lTP|J(2T
zg@vvD635b+sCw}tt3Z=gJfr^?S9E)B)mbB>U%foB`uzO+walk|FE_WiFI}cpNc>9r
z-yVk~lSBm>6@ee>+<9@vV>c%{Yu@+BFVCYFzHY7(|5o{$(A3t#8*#`c*Lw1ViDNQv
z_{lo-+=I&^8PcD>?&(>09Domv$MfX%e;Z1Wg#uq7bPsrY8M5>Q`>6UZ_I3<EIyPDL
zLMrZ$il3_5i05B!(>goee|><|R9An=$tn7p<9gMtt5szkKO8)!YM;RL(E=w&R|$!C
z=`#vuivgf7-;mwvV!fLyh<-rV|309y90cB<tIMzaX2MUZ=0qV4ohH;dtPf+$RJWD}
zT#g=?e$LBJGHN~C1OzID1iNEnX&v@fz!<$t!QKVwnd>9ULIMKNLz%3NUkMF;ZSYQ2
z_3eWPT&0@7f$LdWwtyZXD>w9gTwIGuDL%e*d|W<o`Z`~`%=awWOD$`8%s3O{UjeGr
z|Mk^yW@yFR2yvc%P5^_wW>(*j9;8a|Q4*#wP5cXuJMEPkkDcm2Kxk%lWw+o;-d06M
zX{5t^mv<a}$~mzGmulk1hIK;d9KScG&ZD~9s*gfi&g-~1LY?jV;}CcdH16^1`yqqO
zFsk;3%bC0E%lwqA`5u4&82Q*gE2}y%anN(5kRw@$0MctmK5%>pB0Q{hkB>=m2gR3~
zWTSc_9|tR!S)tw<<qeOXvBTh$sWk#>kkJWQSP=zey?ZyK^Yu>u4;rOOA|Ur*a_Ued
z{i)N&SDCF(2aV0guk<V|ZZp$2bxtb3>HpWaO8mL4*jtM`ScS(n<`(F@!QL?OS0ls0
zjOwd;@nuZ?{o3rcM?&DNxK4DH`G^uH)R|`hvT{ChJX1>&cr~thHI`s!fx~NiZf~L@
zwz4v}tn9-#k7LgU|0tOV0UYez53#g{R#w3EZtrgxV-h5gU3VSr<+?9)^h$Fyd9tQr
zeah=kH}yHhH?+29!_3`xcdCp)S#a^P$yXa^#^CK0nZ7(wj>W`oKyEZHxlQPc*20@O
zF28FL3EddsU#;n3O>OJL+0E};M!~^?!pQd087eudloj@4_&GGJ2(k!v0^p;-ZCSdl
zaPuahUFh+^Wv{b7zfsT^c%hI~xx3IUb9TllEgg*`@leUSEO|0%zOw*i#VYE_Zf7Ta
zv?<ijibUh?%S8tTnOX_UJokvpny-FKTL&d-9CC>#TjAo?hskn0nnbw}XLSm<6*ZYF
zS1iak5~?~QuF&9wIOvNEnKlIf&u0ZrfgZ3x5N|?|_jonEtj<6wRN-s5<Qt}$B>=az
zyx}&!$A0+|AvPYK>5F;CHMUyw{;%8HxQgBMiF`NQ{&Z#A=sO-%nD?uhnK6T?4MHi}
zyO6;C&HG_-iFmqeKYk3cEJZSJd*ox+mE?~}X?|Y}C>_hU7^DFw=kMiZ42|CA=I0zJ
z!c2s4D&pxseHWZi&follg1EFw{gtxyD{a%fynerM(uJcq6BGT}xVIKSI&XNis}9n(
zWZ^IfmUmrJnQxDNhV)o1$`LT{h-n@ha{>Lwy?Y5+S@V~%Iw196y!l1d`ym>pFZiVo
zJZab&7_xG6m)2{?QJzy16|9DadM2axg??DQ;}aX8Nv*M&vfRF5!$}$TIom8F|IX}s
zEoDZ+zdvD;kfSBoj3coL35Q2UY*rWUHv!YfteDVba&PkIPX$HAiE6=f-v54T9Mx`@
zT8~b7S=p??<;Rabez_*U-&54CrWSB9H1yQd6W>5R-rhD=RNP}$qPfjFCZ}C3eE9mp
zx6DisViNf4s+KrsE6spe7>;5H3`(<p@l501?CewV1@IASWQ<W!K^oz%QnLSqYf+i#
ze;K{!&|lvM&5Q4;F}x%?y1wCVi(vQ%vV0AnK|%*8gvW=w4~zQW%$&BbK1Y4Sx5k8O
zL+`Uu>7j}L+}E=sRqr1cW}yDIioK(yQimX$BmVh=<!222QwGOps{4gF5(-La&X8$%
zxDz|Q*p=jc`1P4Ytp~OI&kA(LbmtyU{q~yr`N`6-;_pc>A-S=%Joe9<=D>wFU96c8
zxZBdTE%2Mz+c!plch&S>JdovSgUbL!fgscP?lqA0TDr!AV;4+qk-rufuTmDywm?IA
z(ZqWNjGJ`5MDm~yGhi))bpw24jP&uk4TN1XNZYz%!WjB}$fC9^y0mtOiz}!AqP?eQ
zXRTI~$2l#ew>W!Q%_uZpqdq^1{DY&*4HjK#XPUk(XP;F(U2&%kJ!@kg=s$Kxa7nIS
zRafQa=1GK!$Xy^v{fheJghNXUmJnNGW3&I>T!pa^7N?MKABPka+f;lnH6i^MQu=Z>
zz|{dFVu;F!j*Z3oQv=xm5G}>8eSMTG|1~&!Axb1J=G{AOSni-b@BX$6U<oMr_W?qI
z!u_tG`++Sy3SbHoNMW8BI%*KwDp|in$iXRu^n^aT2#jG?6nIiNIXOX-3Pquh93+E5
zrU5_%WqY2lyzXb9!F-i`?Q%<auCFF>=e+*)8h%zg%IoW=NN4z<Gh1JnnX$97*6ZWo
zgn&~w;iaywZhd_{6hAhV_rcf&vLD%NR8->e5OQ3+2%!Rt;9%uR(*%pc<6Q9#!)?HW
zA|aZr9GsIFyS=Hgm^ja$PeL;cHd*M}Bzhn)5mehJcTk{7M8lA<pVb*E_d%ORx3;<}
z3}IYfQRQp+=%<>HY&Yuw*9xcZ=-!&n^hQjh6ycvg=hZQwMEO{4_{B<0y{EacF}~fz
zsjSTs!>|5$slT&3@!VQ}#$(l3T!|G=bnm9Gi1w2ko84d%i~J)49L-}5bq%WM7RBOG
ze?~Dg{Fo1gQKUA25fR&{tE)#EqLq@+`M9|u5Z}`Q0ac2-d(kO1AfOpQbZQn{Mi-cV
z{%Z(4gO@IT8eE3m1!-%xAb|z?X&iDC%dJMx*+f0SPIKrXuZg&T%w~Fe8VtCA8xn*y
zUkVGPFzZgZ(n2@}BX;lx2ys}Rii=Of4~JPwk2tmfEqepz8(_ddGF3lh*n_siuZ`d4
z;7E>){BCe}{AlN;GWy*O{D!>A`FRhBM8BweOS{6f(p5uEox7^ivX1NfbG4XmqY^2p
z+d`w7Mi-_|kJCT_H311>n2a(n?DQT`Pst0IOQifusG4F*NgeRX(Wvirrxrg`8$Pkk
z*2w!+GMkQYob7-$h;|0Qvn6<eK`Ku`-fW7Vzz2e0m{vf51F$H(d=<Nuua{FleWJW^
zgTVgmaFN!s2cDjUq$ChsX|@pJ)3*VDDR=~-<AG#0dI13h<gTdPp|Cr}+6wF?KLbyR
zcnKXUb*h*GjA|;ZbAS|D?4F4{q5iuRZV+$${v8t+HxVAfACB)YedU>-7ZO8NPH7-_
zHml!KQR4@0VT)wX=X{Eg@GynD{4Z@|BZ(`@c^P)>b6Hqet_eAO3~4{;PK<Rz9$2p3
z%ROxKp3A1$VrtnZM_SVj{;MAG>y^@e7JYb#cq|{^yRiE-+I*`|bT4Ed5<GGYTi~D)
ze<2>itPp1l{oaa8-4SR%AXEaR2^mbV2eAmQg9-i~L|%Y<05(1|BV$*x@CZ~EJ+C44
zu?`jnWD<RUA}(GJx^jq0it{?$ejzE@NZV9XRz_~>Ye1G%%<X=M5^va-sJfKY)u0K0
z<nZutopZ96b^pA10JdFF-{e7-Lg6P4!#51>B7A)t939U|$)X`|NL@*(eyM*YDvAr{
ziO^6V7iMNE=zIC+<lGhS<GEK06AqE3r4sV;-vD*7v)fW&VrO5~ad8QF_ax=>(`ODF
zoYQk<4`s~Amr?GuwIgrBM+)@o%qRG)p8S}QX-$nckDV#NnMr=$!PG2lehJ-D$O`{2
zjqBrykO~l2!hrN(?U`3#x0;!Zxd#z@5XA&+(-P=1>KR!aAioBX9tcGR6H_oGahQ4P
zt-@Uj%CrDcpZxsaFdX!I2}~dd8$u_z>6!r>I%^b0pB-Q-1HG@WD8`+FTX0{1p~m2s
ziv0PR0OrQ|U$t05GE*?WjY1d9z)DtApT4xE3e(I-oU`8zbjd|aA0AX%ns<Q^4hEXW
z6mLe>dSsDP9HTYwCkKY&;y35#Z)0LQL3eTS;+gY?sHo@fS0-gb0KR>{poJ5nB>zEy
zpJ(DxhTX&%mCa9cnFx~>uqt3ugi~kNTE*T7<-hVzuZ2uSjL7qZ?t{Dt9Axnd3lJy-
zau<RS@aM0Om*Y|T*no2%{U31VfY-qF^oW<YYZL+ql9H3L@UPka`K@KU2WJNkHa55~
z;h?$x0Zvk|{gHK~HPtLcI4e#JLm&YF@sMr--tOW1v%quVV_qJ|e`P+Q{Kdlu3Om^^
zgBV+sU}n-N(wl^L;PUG`YHi}J*CA=byKNVM3$Fx-E-zEn)^g(FUiUaAOy)5==}FdU
zXtW8x`R!nc{hXS=;oG;OI?o$hzr=sdwX+);dw?4q7`H!5{c@tBV0rfFAlG>@U(3l!
zIL9)j{^RLVU?5>>={L|e`phVzKLdEdxB%iEcr%k69i8#%)1T=^tTs4JNw~SW`S~lr
z7fUk7GY57Oc*T0f1{W11(8GBkY!H2uC|!u#0WqEU_+nT5WsWhpEMNro?92=ubqwYq
z>sFux$fylSOCgZ>?RDN<>1H^PU|Q+-3UKBwt*sH*r)Ok9O2)qqPe5RQ!pYRpA?=6R
z4*ffX6|4LouHHJTtF3$ERp}N%ltxjyr5iz%1}W*3MnF<h0qGK?TUtW8I|S(lLApyi
zl)CfldG8&+anE08oT0}pd+oL6n)7)+51N0CiHV>75(I{vp5S8QokFI-qZb)Tzsv;c
z_<CZsOiUWpl@qsCDrP%r&pLvuoSbNMb#IPjhuVRRw^Z|bL;3shmj91P#vEBY6}6Z=
z=2=zM?L1}jyM&oa^_Ht&{teXsmAKF)wO?!xj0gw{da16Cwh|K^4V^tfzypK-T5R)3
z&+9TeDG4YU)q{hBeslXL`};R5Ov--3A_o)-WGt}}Y)_4i-Mzy^)H6OFLt;~a3KIh$
zJHcL@>k67rs380ih18G!{_&xq8o<c8J3;3J0HQPnMc=-8K_ktg1F9!Tk{b&VRU<Vu
zU4zkS)#BQT9-`$EVUokX?}vpcCuPo{%?eqssCY5=@HcMtZ?8E1Iop3f+fWArQ9h0X
zPF5Q{z8d^YmuX>FLVk<UFYWC&V2(qXx&><qMnTb$n<%d}G~Y<eoRg4YFZOkq__%yq
zpVM<X{T)5lGNxGnU!R8vt)ou2xk^lX!-4;d&1D<D7U&gTgIOFvCrVFKQd4bAP4AJC
z=I7*~Js~BsnTH82&^c~+u`n_!tEg~-UFohlr>2}A*mD33*W|K)Ynf~mGEw7d!*Ps_
zjX{+M)uD`()Q9H${CurfuiXD0VN6MOFd%9r&PiT659s0_v+<3>Y5I=twgnltRUDu;
z0QaBh;k!fhUoAo-XwmPr+Pk8f|I4IrcxzNDAU2l$O;Up2_f`lC3Z*E7T?{OGo=59q
z=H`w-`GuKXQPCPIHlz=jbWT_ONE?UzLvg<@sA0cpJeeCBVy52^cR9ltUd!L!IZYOZ
zPs#&AV+ryQiYOU%=lIA-h$pjDs;B?26}5_jXhk9VQSg4u;Z8yQFa;IuIVL)rNk7^1
zjG@*xy|}pbk&Q2Qc5s3QW@p>~JszRpSJ6;YlUA#oDA1_Ab;}#MB_Q0(<9O3&aj_J(
zK^#iqM+^+Vr>kxWH7pp%!<?o@Sa>2j`Wz%&VN~4)L`TTKXG6gXt){*^*HV!9pOjM*
zjc|7r;cC{?qrAf<Zm(SY(`uao1q}@<@k&S1PkPAzJ>-uimnU5xDr&l#Y#XwpzREtH
ze@p#{F~8>R$}hkud^%+O`g|c<nWrir?Z#Rj#Y&<#32BmC*ntibYkW=Qy6t~8Dq<LS
zc%u%#=%>gl9+YYXSD3`(Hu)bT@H^ek{CRV~|A&;v{^A>>#TmB6;o&y50yfw(9tJET
z%5+RUvvqPIp~RENv(#esWsSk7IRBFp-+&F8^WX8+*I#W{OHBO3&hCnj?@yC9I;6)0
z-^0+U3xcyMzQeHUn7cv-G&ekNeEZ^;el@mhG1C1pHz5wF(p6}=aVh+NTuF1++L<1I
z$V0>ZRlD6S3fNSjS7!bg@%qf2AZmtHn|E%!Q1t5g`JLV3fsZwJnbw;fS8~GFX(s<q
z0Ww`bEH}XMapM<C^tMo2T{rq+g=26;*#DjtDtuVw*5>BCMt)A_=I<ECdpr92-W}%!
zE&IO)V3eRBCrwm`iSy^lRcurnW8>`@=1wU6{y#C#jbNQ;9YN_W*L%_CnH7llnfVx|
z+Xn}lO%TH|?jb~4l+p`RNtU8iO0p$CORJYJfmkkNULJP%iuiw}<o~|fETKSXd0-$e
zoaM+=vneT-F)UoP0g-?0mGHar=9if`PFx-z6L97YGI1z9(NOY8OsM0J<cp3^l(qka
zvFe9I5gBRN6Pq{|WG^2f^9QzJ$Q$1W4J2SQD)>eJdZjoKkC7qt6ksixz}b$0=>+7&
z0{#hf34<F)YPp~LR*&9pVu#9qf{b?@IDx+kMRgc5YggOCp$JCXi*&>Y3TCLDS6(yK
zK-Q9xmj|z2=8#$6^cCE@a`WK=#q;LZfAf9^r0JL9hK7a)2I+wAg6boc_?#TEl4H^^
z7W<--l1T0z2es(0ujd{f41d$Rey;8u2Qz56@sySwa&h?!dy<W0Jdd56%p4vDE*h7F
zgfF^xN^~?3w+@3avnwk#!7i=WAgx~p0RVH-GK`-;XVNM-ftWiOX0FydpDKPO;uD=r
zc9|WwepJquCC*4sA2;wL692C=45EqJn|H|oEBDX8@&OdeJHd86c0k^aa0imnUyfcV
z3-~Qyqc0GO`e6+91>EcObaWa{E?{B6I1Pq42&s9++H*-s2t2q&M&WAIH|-hm2dqLp
z5H*qNb@jm&SZ{NLpLD^*4I=@#c(yddL8;}*bKUgsgL@2;<##Zte{H;+nVlUMu_vaY
z(lua{^JNiHR-K=1PYlM<v#_`>eWzC%dceW)5)*S6IQPKsmQajvd-pEz!w2SI)yH&Q
zu}K&74Bc+uL&os&OOQ~HAccG+^0D9QDybPKh1N7*BKJ>qb-97AGt7^ynw+Y#vejv6
zE&~)fsJm^~vH#nL5I++5!ze)a>y3g1u2F+~N{I7OLsxeeMDt*ijO*>`fh_2|gv$~H
zC`xFK$Tq$G{lAB^df<$K`mxz9FN-h&O%4C{?f2NDNf#IHU5c8T-6mz=8!vwW#6Iw>
zY}R)o?7I{cDNwwNUEjFFwF0wDQO~cRph}N<+NK}o>)Q@`aIrFs14*88ISGkJfJq0)
zVkJt<cyG>-8JACJTcx;LT5^PcVxrc|m!$??!07U}-UT@yD3{ZXA=>9v@(<&uq-pHM
zp~YHo$4GzpqS*vtPL>5{3P~T}R|BCmsOI0EuDn{_`gcahq_OW44UcjH_-(7ko>oWa
zG;*lvO;)CRJnvAY*L9=0Iq}t19U-A)R~I%HH$@Y`^Y-?pGoG`09)5@1E=rqS%Wrzm
z{XXhEVJi*~KD2i}b=#ZY$aqe>P%fJ>lp|ZBs`eX#2t{wpZi)eK`XzNnh2lzuC7Ma!
z+gnj{4bypT*x3J>{1LNzN2CKj*CVsrI=5}&kmq!(-spgdev-Ot0hS$rp??EO+8$){
zR7hPrK#xyOPBs8`*#?Z%;d`U<04q9}D!Ob}y6-|(uA&Cp0F=Bxr>4A<<x_>pFZj5)
zRJF7OXld7=?4xSauZP1K=I|m_7U2H}Yc5>1;8XyU?oV*ULJlhDT?c3A&M4YQN`42D
zVaAs)kc6Ce2`RTOd-EUs{7`#SpP0%67Z182aFtL<;=Rd?Y=a=f0sOxlQ;_(e%?Xg;
z;w8|mNCnRB=mBJbfCN2Gc`ghZ_LratNY6EB^P8!$?`&>vhFkEjbD(!iJ%axw+)G>~
z5%)j%dZ3GQPZM28o-TC?;q<Qn`)f9$3>>yY%wJ`sgU(NRNO%=>#f>7GuF2ipjO-7I
zksk<pHHKE59)p}8%6-gCrT?!V%)v2>L(V_YW7!Zs(07mZ&d$zS5-&clc_eJ5J%t6_
zo_{~vUH&yFUGAn=CX0@KZ4p>C53{=d{uHgsR}2g%KO1K(H%CjRD+{;h7p)zSHs`7+
z-I(GVWxZUqs&2zR`-k=Yq7Khz7uHwB2x2@9fNK}m)||obcH5|U9O^j{V6qrHf)Nqy
zUN_MEveG^c1~0Cz0w<#zz!5OsXJTeHGBN@LLrV-G`Rx%@obC2@3FtPy@cBqWO#BQH
z2jzxG!ubN0M}exkx-<ClxK1a%Zg2{~@Om7;SGnQgcrTdlSEQv;6*MB2L76Ec9b}tF
zP*59@?EMb(YS3W2`55)MFgN$T<m%emH;pn>=^F!}zVbg2&4T+iJjTbbkf|k>37>f*
z0Jgonye5$pN#ghj>8PP4;0Em)OkqId&p>Mf2C2jr2nI;<(!zorAhO{1WM^mN-e+qb
z9JB?LiWN7h(^Giu(1<}fzzZFK3<z{|sxE<9nHZ*jd!qPInoHZ`_u09;h(NdIn{I%P
z!T-d9+`@0SxL9JO?soE7R_6R*eTZIHm;S$c87OdhE@Fm;@1ffr_va&)03;y5`9*_X
znYEH#UjD76g=$vYiaX#m=<0ICY1gU1vqh`9V<n)<`L;F<1EU6<gjTawQo<i+(>tdB
zXm|C?I9J7m%$a45j5ABX9PtVZj!XbsK>HiSQ(vHY@w5rz44mPw3@Z`k97Jxfs%;<K
zF94j#8S^qBJn?J5M+;`Q2)4X{?H$9(+QbC)EBNcMt}ybafByWyppeKdCnbd;@%bxZ
zHyW+~v*QZPO~aB`RGSf=$D7fh#DEG1ddhVBsJn340&ymLE{B*f24IFTn1RX$&Ce!>
zjXTGsr*K<^0CWR$7&t=zmlNCJ9tkQ9o;CO?Q3xHOt1>C-?fyX<BnU(}l)9Y?SH<>s
z2ngUk%E`6j4#6vNc80WB<b4FV7f2TApAewFt#3_KKJeBO5=gcgU!4E_3oXYJ&~6|j
z?&0K--zm7(PWNldU5{J%?2kIykr$Xi7MY!1viBW_M@~+ai;I9fc_Ag`&G7Ihv?9jG
zCBgdy8z8hJ`_F(@5~ZG-&xl9zC^c2D+V$A|`SYsGOuU5!+83$cN-JjUPqyWpPsg&e
zQI}zGK!ZkvhlevtI!?|ZVQ_urNPa=S%NVZLXS+|qQ$3f=OtkEIUgq*i^<}%UIG%h`
zIJap1QphEU`i8^Ia~5^RhSggr+ev%XBqdSFz$X1!J_nj5<k<19&xmD7S;n9*HY#MH
z_#8qxUSV#)kEP|&eWrjF;MMKy?Bwy30IeQAJ)~}-*WI6?C-8f+#OLd1bMp-~Un_WH
zQ=pLdSVVS6K_ArIJ1H(Mo?2uE;_I*_WQh~!#`eNFjqQ~VrcH<!#bguV=O@|bfj{Z-
z<7e=#0xTAg2=j+<4~?*fYbF4OA;amz-8^P*C*YBPmEZU4Rit(SR|;^==xQomfPd0P
zlq}+va%UH~yg<d-254S{U;Ne8l}jfSR=6@oi#VqGDSLZ44<3Y{{UMf=tfl%6g&>ez
zAF_Tk)Be>H7w7x#P!?`7YRbxypFTxKN8frzixkWTlP|}W?n`LL2NG~S4^sj5`)lyt
zGzuRVaWFr5vURkv1@5OPnvL5M?;LeTMV!pmmsoV#VM6euq?`YiWAJItFPkaSaF4Fu
zRDRi?jxCS!`oCDu^n0-nitMXO0vaoeo|%kWwX!hL2hqHchu(2+h+*4cc(O~4eUHWi
zR1PG8!wNG;k$_WRyqzC!ff`ocA`dQ@aI(R91HsAg2H`kEUjsg~{-@$%wzEg~=`;1}
zU2X<~q5HNBBV&GfIj4n7A%M3doFILavfb@gU|}IM7t}BP`fL_em{UmP6@Y)dlj)+6
z%+Dn#2pk-7LzpCh%Nd#YumElXFzXn63x^o^o4%?J-Y;H<p5hC52m}KO(8W|A$+|(p
z%M#!WI6&i!b}c2Pq3P*Fjx|bKK&Vzg?4g3(*WBD@NyN@u>Ebi(W28JQ45_}iZ6@82
z>1Ppbenxb}4VeFt9BF5wSXfTqv#UkzXJP$1@-*wqX}_M9mL>r66uJ%gd`(|m_pDtD
zRbP!6xlV+TNrvL|7nwG*>AuA;D|A-KSE_PBN@pdAbwly!y&BICHv7h6m}t5YqOv(M
zUU(U1B6={g{nF_!dt;9NAaeX1E8<KtHwW5-$eDJgBUgNCcux7}KUVT0T!Aqj#hYmR
zjj$yHo(a%@k|FnRs?`AW1VGmiJBNrS78Y(SEqw{;zR+3&5tRc!`O%$&qs=i(SUNCF
zf{FnoCn)DAayn%uI4-=^>(H)+Mk}p+68hrK;^I?POhcvz4{qw>&`^?+)`5Tu+H`Oz
zq-CqZAY)JoArQ_4oR<)+JWvg{Fu?gZ2mMNDvZw*?QKscPD6b6qLB%OrFXnoWSrd83
zxA5#>1<Q=@5s;{00xSqgh=6qkgMP`+ZSaG_x`8d&9b{GdOK`|#w3C+q0y8!D-dY18
z2?+#3Iv;PULU!mYd<%ed!4L##3-uCqU3Z9hA>7*9k}1SxzWKr5+W!d7OtE@{*gJ@;
znZt{DGPl*<anNOxx+NyKw&wQ-S5$x9WCKgxlPCG9nCAM-Uo?C0-vcIBA1CyqNHEVl
z(b2$ES&IWjN^7ejL1FsZwkd(!eVVen|H^K2=W^10Tp3mhf(WMWfhfK15bj8IbDe!u
zQA6lE0e*50SpBx@wq3wS-B@2&ByN5SCz~jZm=7S7U<e>u3_+>QPxH=}sfHua93W+R
zu&WCl`L6Z^#MCM&K?!rLau=?8m8GRX!Mc@<Dvqjwf`KOT&3p`sxrJ|abpjVCpZvze
z(&1uKQC{A>3Dd5YFX`oQ5rgB6)ZzmP;H@|yZMwDfDcdw$<-jNiYyAD+r*3I^gYsS1
zz(qPE0Z4=i0I^{&gFyv?xClKUN3RrDYf`lW`{$Q1_5{e`pdUch#f>1K4S-JX7TYn*
z_^eQ@SUHiL$mxW{gk|soVVWmKVkNS7Jt=@!O;RWFUc`A$t19`?RMvxpmk)6`&8R+O
zYGgf<qjq~PN&i^a`1QcHgt3W#f%GT*JWiXL{cE99C7XKPKW?8K8r}a2o(irdt^HaH
zTl+I>-@i-7U+%c2h8@Ur>w{So+kIInsZS?6Sv9|Q$wuXJM&OBti;^?z>wBHmhrg#Q
z(oqVl+$Czf%5l+iRhdt%zTVG2-FftE<#GGN8LOnat#Su95$|@=5293><S*)*umgB_
zjYUP9Crga3zj>#H&DZL;UYFd2QES>IG?3tf{*yOwMEbMF-V~1g^u;)~bx5%yVvoLT
zvcJCafJtC^7~;W{2RE=C9sl$vt8Q*J&BbpxFobvdUtg=!-@!QFUc{qsr=$IlR`KiJ
z#r<5B?yU)%veLK01U6@@yCODiZC{+7P!p7>$sPDBXytJ+j$Np4qa!cYi7sBghL>+;
zZB0+u4W)HU!{bMdTwgJRJEp2^QZq7^ysVm`rumwm4`EFg%BibWND3hQD9=MA#ei4$
z;xqvwR~?{{0N<JM>(^4!(v?R9m$zQj<D<6^+#r0@6H_xg+lR|_U4l+R<HFPMH|irn
zQ%yUE3C@;V6@FYSOLBSf{quMi%37y6$9<EYA`;Wpu#A^<%rjjBVvC`oRJRLpAI<9d
zwJm_6mBLC<OzZ(2T>!fNER!t<+%tHgaA5U&c-}OB_$M^x+*CZo4Fce>aByC8c)jVZ
z-$g>a(lcUrEn6;GvsQ1vT=QkW&~ULXQWHypiQw$ZS>pv3UGdfL552|x`y+omqNUyP
zYgj%wz>|rw0-P6vPYXIW8Mk5kdv1bZvh%8W3wx)Ol%}oGf`!Y~y6d<S9TB2Ov*^gT
zhBigvVL6dNPIhbm$r<YY_qOmRPn~u{c988H{8@pUG2k1hCbi#qSX(nEGPf+*(Gd=P
z{d(%?C`g~@bYG28<qZVh*;!kA_~UlB624bgSBH*5n<2ZkHLZnZK8qH&l%}R6-DIz?
z@6OPWmXT3gf6@<_GrNhk;6Hxc!A&rE(X%LIH`CG{6+0nsn`~xtMagijLA5VQbvr-}
z4W8=@aRj{dfK)O2MLBx<cOvF6XR6BDo%6D_UFUlp<X`hAZeD-j{ri1SPyFa;F3@)E
z{<Qavi5bpUe+-iu9L%?={CC4;I6VGtmYIH2&NIfr$$}8iFygZLKZg_Ujqm&5-<LRC
zkF@UuZm;ne|18~TRr<XQ(F527FQgg3&!eY2a(s0;19=PArIVAXov|GP`j_byovek#
zyMjMjzGP{gp&*_ST_PXx-iEOMTSaMn`!Pocm;iazP#fO|^d25L32a|i^DXuSv@9C+
zytb>oFmYUiiC|&cVAk_+7KjJbs&i5n6@|nF+9w=uWK-%d`V#FwejHF$P5wPQQ(?E1
zE|-X#D$*qO)Ej<J{}6kzy=nAVzBnDBW)%MRo}O`k(t34uG73tb75Dkbhb=EQ#IwM=
zWrX)^4?E`%F|n1qs!dYm!`xW)q|Y=$^6>G+{jkk9DlY(~{cjO`b%}BjOv%p9kBVC9
z`h?UO5<k~a*YAg8&n`PU?bZzl>q@KmwVj<HgGCaOm<JD1uC53mBn?ER6&~l_*3)5t
zlW^airK9Uu+E++=1%XrL#e0qx7T&hpF3Dx~X0a>`GBLiK8zZP&*}!=9`W|vtU2S_W
z_9~(<OJRWZ-_jSQ@<e%2Z>LtMOMZvU$|kLUscqvdi`Ms1$h*UzVp8|V&j{WZixn<$
z{oQ5TC?DMl_phtlm6iJc{WadzyE;0?{&k%Vz&}Zwi-~Dq3jqm$5nebkBu2b><FT}4
ziqTQ$Y`Yim{>F8Vg3bCSZcpz8Bfa_DYF}Psqj*7K=*%xN_K?*r(mxk}wv0L#eeZb`
zMJWfQr>}aWP|cF12eqyEL2%NiA5pQikNa#L$%71br)8aa#-%&zrhj+-?BLql)L<xi
zqCKir%Lnh_gIS6i_3T;@L*eP#aQx?5lHZQb%tkeo^t1U!hT_@C`y+<zEO_V9w8+~D
zvaM&W#pR)$A0;p`n${Mud>QK_O0J9BEg8k7Wjh{p_0Br3h3zoUd~W);;dr9<;c(r<
zz)XM^UI(+*s#0D#tpX+!Q?28!u<PdNCnu*>6O)WPce35w!qwHdODlF7EmU9G?sdz=
zzFAw|=5yyAnQ%5RT*S7vUMVObGd1lQ92^@+xeKi5wmnMwKNJ)rSeUZ~dX#w?8PXqQ
zuxS7kzsYAG>4&$w_IvjF@*GdxZ}P>9fGXnCH_?k83#zKbXiYq3otbezf-4>DVcWhn
zBv>d}ZxgF>{+du~kp8t}J@dG<=F1SuSARAqC+E|s`afhjYR{_6qc_&mf9&<2CThH~
z`<W(6A8GhU>#X?;e;hh_nkBq<7foB~^H@F@0fhC8T2${#<8BzSn@Q0YG4x0uJsJPH
z;7)eXnW;RoD;TQKpmO$GD6xZNy6U`m_V2BNih9Ls40b6e+A8w~RBTDaMt$XyfK#NV
zrS-&8t+mE}qN9EC>gvtTlvH1QRI%IXTRy&y>guM_(%-bS--ZW^@$n71x~19Uk2->p
zu(2K3Ln=-BVhXjC%{ey*(>-!z_aRQhRIDX`eVx8_>prX+rsAU(AM1zIw3q@7KgGoC
zS+uLE>3^)geMdF>JiJIv?f0mO8lh$Mzat|+?Rh@_ZfbW`08i-C>_~dN;GmU9lckFf
zcNs1DjvWtPE;=6nN^oV#38STEd8w*J`~6E3Lt&Z{-Ol5md$*aiK6kjkJR$-VmvoZy
zK;hcqB(~(fzcg={cS!Zoo`%ui)41Oz&vJ$WGDP<&(n~65IQ>t@o1)drjTKynA60#-
zq*2t8NzF-xa0hr-PG^2A?R&q?(ji7@s;{+JI{i+WZ*V9(7pE)S^eukzvKpYuxBarh
zU%3A^g-wCkm_SW6)FY|7-Ru=<efEN=RrQs=mxAS2vnS)B4-nfqdi%GVfjshMi*gwe
z6!)T$zlpTd8c;G=-a^%kv!KV-DYgC+cXM24%>SA7PF~@r*%u)ui$pn|(7Hd*s0I!N
zcj^m6bfZ<_x4l$(sn(HK#j{!8+|ziy{fH=8S6eM_EH7CmoTV+++Nf-ONLZfQI;7hD
z1G*2l$m5;)XgBA@r)du!G5j?85wo`2{UqZSg9R_MM~Ww8F5{z1Oy6vGTl%gcEWG4v
z>FU9pvA27Vb)2KN)<iiwB#x~`{Z-&RW!6>Uy!xh<WibbbJjl%-{`Xf>KndjOpXf-G
zeJECYeB|3tKE0z$uqg6mV|#{{&$5m7F~y|YhVJ`}KilLvMG|P;Qb7rR1^XM8rB>cs
zzN++f{c|;>lqB|h?wiN{;nzXGoHi4@78~)-W7f}~m)$R4rvJ@}sZppbko5_x-a2H}
z?c`ftTiv~C_r{5Xo$j5iMWkO043?9!fBlpvzMQ7Y=y51XfBxxk_Lmalzn4fAmFI(Y
zx-mDG)b^+Urkqt}SAmA5+c8IZ3HyTWR9ES=$bJ4~*YlI5_I8i|9Tm#3og6yuXDiX;
zVFlyl6a*bQszwkcCb+wMdxRH@sZoBlloL19t7jD8v=0ew{JV6|#`+;am0xHv`(XXm
z`n%JJxZd3NB_(y|#7<6|=>(-$FBGh!GdE@>mtrP2Etyj1HFY$oAF_O16&jc;F`b+j
zzRN{+C$N3(@VSJIdStXF>6udfL~3>IqW<@v9iEX^M0a+HVC5hRB9JEn7xki3`W2I;
zhfr*fwE%US7t#ND@DWI#B_vApYj;rnv|E_Z-`bt)AI6c|Gwro<)ta1hs23|)J1vP_
z@$P85f-Ar|*S|2Ze+W1f{nHNCujk|9gkGzTo%rz=t?F!tR?so|eVXGS^$1-r>AyQx
zVR*;t?(RF~BsYbd`$g?GO_NOD3ko|=b$dOl4frVxljX^4^+&c&@80`Di%)!I^{;HH
zL<*G9ta|g;7en}3xbvJ=?C#YHw{@`XB99zzXwu<EkqE_Vp0c@<Q`El-0(mh9D)5}2
z>8sil1C9n9luN9Kb_{<Mm0l`YrhOfB9<S{aYnxRcb9t7Nq{}N-*iimV;(~BZcb5D6
zZzTaUmEm*^VVRSp8P5~Rf>cX4sgtp!wJ4!;NAiZj>_pd)-L&w<9WHfsDbHh<SlnSN
z7d|sW>8r%k@lP0+#}$7`EoQy;X;$Yy_O-P6zjDZbgVcup2~m7iA_X`<ZRHec8q!N%
zlCy{l@3WaC;$&(J<@(v2ux+6!y%#R{%Jb?lfh4S`NU}|rLj?>SHA|hWj11l9XS_dp
zs_sz~h5n)_23n4h4Q0T;OU7-HyN_mh`%u+06uWGs8CQSvI`Ukrx?v0vx19^SE4i=s
zM=jI0EzBDChbiA$cV+FlapU@0N<vIU*z1eW$FYzK>7C58TVl_N1OKErC17$;TJXx8
zCJA}5-jhG(J*QZLWnK7UNGURrbMTggrmxs^j3-jiz-g&{*&v*zu144lT!&&;2bXJ@
zb1~gPWtZE1Nv`YMI&0^6sEEa8Cg3ivV!vlxE2=2-P{~(lxV@Sdv!u0Bw3+r(K%*sz
z-xwA_UMtRfT+&u-CW9pVAT}k@^`nRf3&{eE%o-elif^!uXlQpKs>ugNe8*dOj~=xv
zq=aOQm|IKME2-E+F(7(@8R4?z-t*Q_1rr9zqjsBJ#vG~PtUu+llo;~)Y_%SBXBrV6
zw3&XhrY2B+Sns;Q5cT8pl<OOz*yl7+67kr=sRJtq&9h>a9TN8iDVj>o;jig`M1o5e
zKj*@P8Y*o%%qXl9Sn|w8`;DBA2kE=$|JJ0q{bHyV_~#EAJxp^|Kw>&eL!;9ipOKLh
z7x$`2EB0vl$Km>VL|Pi1we=YFa}F3qsuf{#^YFB?sab3efu*6^_Uw!|lw81Jw!Xra
zxYn69{AtUtnH|Z{1o3pm&QSZXuoU`jD_WQbVB!v?i=T2?5`EK6etUiOh9>ISx(8{R
zYS3xu;%1Pm;O}6<C_}0?i;Yz{A&|ek(+qVvJiA~hdjGW5<yMFs^(U!i%2#|3X1xg*
z*;%f)XqV@R^dBg74d}(b^6j+2qa2!6#6nWN@_X4gD%Dz7qt)4S=tCt{8lC*Kk6ioT
z;uT3*ZBHa8Z#enZ6qcFEBrVq$5IhYSG@zmwLXiyy6F?^h>@T4`lk7K}D8dRj{i^x^
z(Eq%Sn{N#khM)$RnaQDjB>SzXh*-dBA(V8Qh~-zYQD@-#x;|Weu&_MTClpaoYEZB)
zuyKw_T)WJ7=VC=Gfu_>_-4BQIH<R{dUFaZWU0e^<Z`oM<xj+6bYCYwun;B@Oum(64
z;LI0zJa)CFC(TCpnrzmDv7;YR6usL{;1Y|X{YJ6dyK>mH>mKnV^Wf`LT-!a#pK-a}
zGCVttlI{YAOp;x-9z|^pl{V8;!`6RD5!bp!(>>Mw?{#%!V>{17Y+n^qxdZ9pd3Xw#
z84@K7kG6PVz*$HA95*05oGNOCi_7h!pDSn+n_TP^)SbIZjM(Yvb3nWWc@$u_&MDIK
z0*|{8T%LORJUq82Q4K1%d{93+ZG5S<HF!Z#mrX=iAIHV`K;Rr^w@!wO4yZmKT<g6|
z_kX{eHy_v1uHO*|q;OQR{y%RxmGNu&Y^sdu%T0^wiJ6L!70=~nFw0iMu<TkFFIcj$
zaIG?o;4kUY&Q1g5Imf)|m_ytMJqc|)7WSck%mq_H=9!q7wfsf7qQb(VAv+(L@|t(*
zToqv-KKuqB!Y#4FWS#kq5p{0s1YmTtVH1oE7AFb*^f)kXYHcN>rXCP-`@(G_KU2rk
z_p>Ja^Jf=O2a+ehX>5FwBI4z-{VHki%YdU3?#7fVXh-x7j@}9$k%V_<DNf#}fmQ4%
z)R6x3r@--%0||fIwTot!%c*c&;&G85Yx6dLAcY;`QVkt|_lR0wPwkj5j<MxS%!I8C
z?ii=lDRqaNqm|VQ#oMQQrLMof=`tkBKW<}GoXhl2LUz81O_JKVR^H&Kp>gMnw0O=*
zIi;^(0`={tNWgx=u!ivS{UUT(nvt%q7tpz+CAS0>6vX?~LEO;i&nhw^yR2;swq~3p
z@fXyzoF1Q6{SwOlo*d`pEhO=l__d@7yPaq%m0T7ZjAUlETUpngpR@LKOOKE3WPP>R
zURvS=cqo7%q@rR#2|z=?F&}hS%jVZ44J&I@?CHyA&xEP;HN;wa2x!L*&ul0KZ`%+P
z?^jrwvc<QL7&B|%M{~8&PxDyqjb3Oo6@2$rR9w8GkBPzWzums+7VP%zWV^NEO4@Am
z-%@UK5Q`W|5WN)7w2QQj;kF{6Kl`<<Kx>MnOcLpf)p+}v8DV#W_k?0jnBlqW(=w?z
zx~R&N_181<Z;5nNwl=3@+=3Td#<EY|O0<mK4$upNrVq%ZVRF2+xd~7_EftmWcZVhd
z0(Gf&1u#dRt#cmu{@qVHtMcgZiUUXv5D<sJFboJQLSvz)ukUm(ZhuuSub`khQLE|A
z3@0jX(!tsOO>hR|Q+o#o&4N@$ReiT_VH85NSC^NcKSObKtHVTgZ1Y!u)K5Pk5Ud|{
zM?Y9tU=OFbn<PlgS`ZWYxx912SxLgzSw+RycI?e+rS-XKpLA~F%IL&ljyyRrg|Kv7
znd2{t3)BCd{_5}BN!PT#Bt%I);*UN^PiQk%%zgBh*`@Rbo=AUN&PS|y)?~@Md+ZF+
z+Uk1Y-@4?oM6Ks1u=y)$E5l?Tj^r5^iHZ`5QZW3QL1+6ASDo8mot0q~Dz44U=crwc
zm7z>$UX@zSBCjhr%wiRx`r2AYCiP4YGsq%Lv>Y7~l(w4%ILNjt;>k4dL&97h^p^Vf
zQU@R&6m`-CJnXfOZF3ih0inpm*x1J%V15%45{mLi{qXgN2Iz;-sVPl>UAVl`(0B*X
zL71`KLFxh6JjB_}iz-SeD1^(>v?+m>RZCYFzT3j0B6&{KFmyh|%;Wi^%|En)6nF>k
z=^>rd!XizGpPwIr3{QaYn1DHc2xkY&G7OeqlrIB;#FiKC@Q|W&=lO^6);5UrgSiF>
zdFarOVP5-lV88|jwUBs3Lqo$!va^<aV)IHqq<6vg;81VaOy%{;iMuc)EPw$9<3L+a
zPgzL`1Ts@KWh*cZL&d8E^mRl`%tNBPOK-P=f^;WFZ9hlfeZ{0ECkL_dre)6pq!$>#
ze?iL59+R1A+$Qk`CM;uZ{(m6z`R!W{Ysngn|MNa`{y5hEc*O9resl0+6c0tdV+5vM
z$uh=SKAuVcVd)m#V+ybBFX*8rKUIy67Y>E^8MBOkMMfJx$1PLZrNZLjt>f6W5^$6G
zEKhD@GyYtG*<x42oc$9P-of41kIy$PdHbg)l_cHMV`m0uQExTg_$@I((LmTp&*j?N
z93kexU<vn}MK5}&kpqYf2Ghi>xZSqK!^LnK8XAB>4?O2vPW)iS2Z4%D1VY6Rb2~_i
z2bsbY<SD{`g$66M!Xe!0Eld|6uVZf(6eaKvEe(x#gpWSFAuouroIgZpvMnH~5>g=1
z>Ojma_EvNV<{rt3iCCnHL|?s2BdlRC1=ExFCqNN@D)!>q)aQYMyL&wZTqYzS%n|eJ
zAYKyz+Odnv87#(i2mWQ?xTl7O_8o1EwA4cQA1o-E$B$9`v2bxyIL+^%GgtGE-iQYo
z1scx#MTj7Q&l(^Z%AlED3l9!H&s$m$tB;B_3855`tj?Zj5+f>Vy|lyl?H64Yh;zqJ
z%EBxTG}B;v?ko=S(~+Qmh@=qr0z1(rX+*-}6?VNjX3c|76XWdyur~CXZv1PQemKjc
z+W-Ei)&=Wa{zcUI?<Vy#*He|<h^Ch!qoWi_?A>;=dR8<uUZM)I9}}H@L)lbR*>E*>
zeu>+B@zGe<doFeVy4`!fj^lT)3|}(G*w#_ubAcys)~rLe1r0E0sN|eBW^|v9YHQVK
z%}W>WuNa|r%LHO|7Gt$*$gY$G19ZX<uS9b3QULae{m!M;RVSE-7b_1Ue$pR4ycgWs
z06TX*F(**`WZ;_swkXV{P=aHkqMmSb!w{ngaEhRXhW!zKC44S5APk3DB>c3&U-IT>
zX-99b6)>Uo%OJ}Z2FalHLhFP6mOP-ey4cX!GI<3AAQ0Pmeeh;C@IydHw*a$Essa5W
zkl4Uvt*X2nXyciHupAlpUV{vF3P}7A6k%n>(?lUoGXOrR&v>h@;cWq4FNkwU$AO6!
zAeR*Jd|G%xhma=b14%y*SXh7{nwFPWW49zZiHn7Wbm=FkI0*cbhYwr(`@`0+uixd6
zGk`(|R%+4M*j$ZM?B~zbSu4nR;a5P(^m2Y|O-2(1lBhAeC8B?gWc2p`_nI#GV6DM(
zj@h=O$d*+g3P{kFjsicjBwU2u4t2c_EKo!5@D3U#B+6ZFosK`bQ~BIQli1R9*7KKu
z;ZD};D4NIQwrrybeD%A7ucw1%qi19jEI1x?mnQ!0ZcabGJI2Y5?>O)-3iTB--iET0
zhB%qL>`U1(x0KWd4PVkbyK`^?28g79FbHzId>%|pO!#;~R4zj2KP(mkdl9KR@eL7m
zRaKpGvrnLYf=OYs+w+k7_=JSWx;Rg<W^qx(G}0;97QpeLrKRO_D*5c$hguNh%)`M4
zLdD0%6qWg3zcx?9^b(v$Q{VJpF8>%JD{|c-j{yl4dw$<Vo8_JZiLvM2d<!JT4RYi9
zBl+RcE8>kxl_PQ^D=UexujkrHD_hE8m1wG{I2^9Z<x)E6z+MYU8zgvre%SDV`Ivr+
zi9wx)_!+8eTwGj{^YXHvR=gO@my6WG{|%7-yW8pAv*C}9yxdkjjd(@P^-;bwa&@YE
zY1{QhaFF1;!#9JieG0mba>+vDMRom%OU^?A9<;hsp7HM9{v9Fah2aCK47HuFYj5lF
z929q{gocKmDrZCuojtDnOVZQHudXr?Bvq{XydTSU`!4F@%%H{xT5lJDIO_c?xijKL
z9c5)Co?nmdV95H!kxes=jg3)JfTczx0}HnvWfKkM?*f-sPu`8h$cTuuJe}ZxZ%-iU
z6U+uri^ZG~Q$s=E>q6==ysFr^xEIFu&cKWL6c`?kmWL`+Qc;022b+--2*lvr<AVer
z+bQr*TpUm6&F`cQg5K~wHekkJOcr4c2qXP7Ul$RVy&GX?z&e3F6_NN0f(+pBP_4mc
z3j&q*q2FgssHrpHJL(9&2Zw75sUt{>;CVp8()4x(bBJNj$?Q<1gLF9kGHo0_+hh>y
zzknbmS9y6vz%x7zYKjf`XsrEm0x^sVeFjniQbQ2L!`U`8v~QZ~bSIGP&f>z#$}64A
zt;WrT%*Y?MhEndr*CSME_d_MK6!$)I&$}Na1akgeYc?M4Pp-o!BJ%k^7j<}<nDN?b
z8m}pcY%Ft2+>b<YtFGkCqYVY<{rRNC-8MwLv|=Ld&P}!n=`8sicQ!Hl+pOe})OH0u
zFAY`?-p?QVV+Hqcqd<-oQO`SFE0Z*Hwl{%=AHE0KvG}?$VyEty8*z3vu<G%4;t~W^
zl}`K$3|m#6fi}uD?!~Kp)OV=Nfk8puT3#-ba4^Bx71VFFKfNO$I**QvE4oGBeJ)+h
zPDVxsrpH&rawMEc^^bXYPGggX6d>{i#Rq)FQ_do+?Otf@iIN#Rvki$Zye7T(BW~|a
z&d=*BE&`bXe}ifatbQP^g#>j&V1WZCucCsh{>jevHX@G4Ma=H;qemco0l6$FqFUlx
zT*R8hoLfp3W<Url6L3oK05G%Ek|FOCAo)XVX(0^`G(_lQCc$&xB>iL9<IJZt&1mDU
z!J!i1uiqt0L(ogy{dAftIUryhKG%EqVx^*J_yh!Ccm7Z-z6d5i7(QP)?)$Yt016_6
z@!yLq_w-n<wllgCs-9BdWAd|uO=R7{DN4fK+u3gau)h8ws*j0T?ni^|%CazL*T3dk
zTG!)`XB$N8=f<0peRQ4=sXYI0`iBV@l@TjjF8=7ZECQ@l1|r*^ji1m69JA!*2JwA<
zN6Uiz#-JdF)<IP@ZfWS-=_oQixA@b~$EC|{oBRF!LbZbK`%cT<pLtg9E;7aV*sZOI
zwe`_R#Bb4kBU#f{YQrZmK&9CZ7jUI#Z??{?c%WYGR}DpJch{fB6BFf`f_z1l!@2jA
zjI;2afO8P*9t-fMatjMlFy?@!4J%qyrBPQ=Mh1(?5CTx=Jk}9u^OMC7n3;FVMEiiq
zL<rtdz+GRq9{$A#PY-xtzqj;4nbZ9KrUU;aXo=d}AZq6QJ8$N-wKX3+pT4Jl%ge@W
zYw#y^b#;lRK$x~2FBuym^$GyYB!nK276fsov=%oegx)Tr;}0iASb%Wmp$lpK8p2uz
zS`zT^idLl6ft3n0P_{?#b^6?XivI$1QG#ZBAk|(7Nb8f+f=~Mm+)*KN{<&`htSoX~
zTaa1`h~+@s4ai*ecR}`%_Vg?b;=JCwft-$bc2-|)a1`*~fGh)&2S}+tPGUK>nu5a{
z-^rkrZ`Q)X0#vQ=rkTmd$kMnk7#MH+30YzXNw&)`VJQobPu!NqA)w(#Rk7?9A8o_K
z(i%$lu`*j<_v5v2F|GiYs9Yk^x7^7_F#BoLK$dOK^2&5oXjZ+}YZd!LB!zE;6Z*G4
zB!8I<y!o%2`sMDRMf<t7Qr7X6HfnHd-hvM0?9W%fV}p2pMJv*09x%u+)N$aW%4w*5
z{Tik`up(2IPQ3h;pRV&<QPq+lzT7<|!_V8w4TGIeN{ND!I;xmN{<7zklBz19o+((2
zxiB~VW!D#3em&lZ$bmK3IcVr<9<~_XM$3+n+7(fGvd&55Gp?zdO!-XfwoP?3S_ca4
z0}3@YHSaI?!PWe##QO!f8|@4X9whdGf$393gjp|C`l$TN3k&-ts(?~<Jwl&-p!k4J
zaYVd%X9)uEYP4B$KYnHK6i4NVN=`<=r(p6=ChT4f7NSrY9)niLXt-OTM<>R|f2eKN
zy&aj2)C8_<fONc6Qc6!tyHQ6n0<x)=&s320TMz3{g3kQb3twvQVex6v-URM%5I<r@
zF-iFG>4yf0yNzQrcq-G>Z%x8xz!6_>HuP}uLA@v_wZZ}ehek*1LA3#C2=%lJSs(zj
zap!936-#L;2Xa3g4iCIwiq_B|RC15Z4a|_Rg+P4+idB^8k5zNU25tKHCI$Y*UjwTP
zBE7*H{!&E+nKB3}Xz$-oQC-ULlV|r(PfkyFA)ZNu6da$&V4Q^8&IfLBoIObpNKj@%
z$;`veO_YE<<qi%)wmne5JVlyc2R|H0{)#IrQRCsbN&|9CcsLZ&Jy4mYQ5pLwKpDYk
zAOL&a!-uzo(ZN&Jic;lCG)FZ%TIY<IZ1w3YCMOq8TCJY!(4yj$Mn><rT>cFOy{rK5
z5uXQtnKJDki2YVk8?UbU+1)K|bic5;c>Zi(X0OHBc0DgW-D#y;jzxWMbFA*~Usoib
zerPjw|A4!tn}ERSy`m*Kh?>ePDOrPSwy>}~nS64CN1CrM=u=8sxi9u()79u_&kwri
zR3_RiGupHH@?Mnx-??LBA+cknVx$=T(w@_)>iDOtZ1a*i^aryS)IBx7zR6i;MEub2
zIJ@8xrYu0d+COudyA>buZ7NyosQ>NMRG|U2RY=9VB>wt`2C9nEyy>`07_k?A{<mbC
z*SrKV<0ZZ*b`|Haf26zhdb%6Aj)v_~2O-)=EneKeJ{_d>AAcBw$gGMTC6fg9h4dw8
zb%DC*g_IQ7+sC`RF_(SgoRec>xFO9K!nHvm?E;}z9UzH}j*V>#6<BBCEdzPK66t*u
zI<!$Ki7!*7udxV`#cmzmO#Ltg-5&HiF3*gRARHQF4BVekh!BQh^}}Uu6GYO6tej(D
z#su}53d3D&pq#8o;5{(*;RYm8>CbJ@QcnK-`IwfLz-MydY7FGOBp917)znZQj`GG(
zxc?X#DTB2?iq3>p+M>(*+f*k3y}b%a7;pFF`|rPM684LI)lNIvIcvR*9&!)?zqMKI
zk6So6aavqFI9Ep7Kk88I{q)=ijVB;@D?hgjGPiOvlQ_A?<Yct;R8<X3?}6ur{$Xox
zcdgfsCWlI7is#I)ddoVeNTcu7&Ehms!moqWUCxjf(w~7q4rF6fJJSLmJ(fb+{KMb5
z^g^ntD5B66&2i2L2_iI8%lV;y>=UPEk3v-}c#}r@zxMGv+3f^+u_(%R|FI>D3n5xx
zX__Ybe6kt~(@`@t>SK9FZC#nvVzt2XxB#j7x1?+7^wU)p4q8{Mok6FN{-Uj;17X;l
z5TgVNavYq{?>ZnM4+#pI0>3uIVFW-2*#ml#f%C(lT=olvedT>Q+=}OBXDOD6f87_p
zr#Ju~AZYqpnp=7e)WM|L#r$goa5`9mHO0jdudpz;UExEYZ`Nk1+Th#;86n6Mo!}^f
zeI(-ZXWvCdJ^;g}FF<a{=g*(hy5Me?m6^$HGYcI%Bvo@uOL0t2UI<Fzs0U^%Tm%6q
z07wS7U_AljEI9oF13!#E#Rrw{p0(l@RE~}y3<FHTAj94@BH|;CP3-B@ML<q4Up}PP
zWCv>S^1ch~IyM`A7jvAnvsO>a9@N%akn+&5XqL0ln5-SHb?P>;lvLrFO~{i7chJ)Z
ze~u0v8<mBOcwSR^W3_ROCr?tRtDMOM<4T-urjiBXJHvMN_Tr+GlS?gByTKX@?Kp|_
zX0+7p#XFh#`HjX>#kNE|+6V2OAsFwj>d%hM&lWGKwr_ZS$yhp>Z=oPh?zh&@K8@{|
z7TFMO4P><ZUMK&S&1Qq>AwmqMkTw=@s_<A?IAy(E`~vrV9IWOl2ga*ah#<qyMFe%*
zul7EWO!VRY_NwF-XEijPKs!F`56W0FT>ln*NaD1DG-Z$mB5^^)9U{8g#Ds<>r1S*%
zgYc%&+o*>irS&FF6By!wd<n_6qvPXu4prC}(%vZkg1eW`?>jS7idrKji^z%#@Pib~
z`C)u<S^#b#F0KkF^>B$fmZ4zX+Xx5?%U0au`!c^b_=5KfM6s6RaJs?j0WwGN7L+T4
zCT6tN9uPcqAI$!osROLcfQ#b-Eq)xY-6D`mh(O7{ImQSq-bKjrJVo_-pHX!b9Aef{
zpjCz2K0P<rltp^rb*Q>eqpqT|xq&k>{Bn3=g5Q31YWwGsj?NC7Y-?vH_*g(+#FLni
zU~nJ60vvQ*LTrOo)w~0flLGYgZ+l`!w<dsiYlnJCAv8tLW?P>+cXW?+HZ%J>mSvIi
zTg<T$Wu4MT5|fEIr#H5Xi$6YnqJW<b<Ny{;4Y{s9A#i>DGI6WFjKZZ!s;)Pw?|I~(
z!aVa7@pWEcK=0&%YyAMr{?dU<)0un3$0z0@q^j~+5qZWOlmw&pUXE^>L{i*6p5Ha=
z#M+!MYJ_G(fz0$4E}i<BY2PF!N}{7xp^X5xtLBHI4lOzhENs-&J{^XR@Bw*UUqJ*l
z3q5^Fc{y?h>P^(wA4#3rtZfhAP~BQz2RAh&2HHbk@LEtCiZfBs8>3Kao&RLX#S9_$
zjdwv;!&Ck6!e>BSChaElI$dv8?LdF??p@fVPoU?82wl$rYTawF4kaWcK>rvU2j_86
z+faHzN(#mP%&p%zC%7@fCcSa60n;%scn=fsL5B@~5!~U2cq{IM6w+O_et+=cOf8N9
z#|8$*NJWLJ#IyZ~hm2}gd=BgN*(NzKchYI74eV1J8DvZ3H0O1{KaeB^X{I6~*IiQ;
zbv$nb>%F|ni;BJ$6cG9;xqATbrcu~aP`*0oRYRG5Ze`_L2ZsRBW&%lj{mdLwem*(A
zcN~rCtOUG8b#AswLv@F16Q5pu<`(EYA0V_5rp(W6=9xc_|ItozudLu}>*m%z*9skB
zNZ-}dmTu{go(>uI`*8v2Da0?k`a*?#ztVjkz`L0x@hp)w80H&ZWFXNB;G6Z%7!rK*
zL?$-USIGdSl@x!HNZ<S0+8<ZUSI(-|<go+MxA7M+&LsGwrRkYiJPr;DebGl03kd>4
z@SpsczQrlU{&=D|1=nkp%x2Jt?N#ws_U6{k4#M+wdkVe+Ma475TQ;Z^o;(P(^+RUx
z=Rh(C7FIAgH*Ga<>)*W#2(>t{7ro^@v#<v*PHD^K!%5x?IR%BP%E~tdsq2EBzE~11
zO3eczS`}E*fER|GN`icTgI0`XKq@@+OMois2)2nR8ae29-!k<J1w+Ma6BBMfS5J5f
zZEZ4_<6NZ-*l@Aw6`?-V0IB?tAWan5R$^lOaLo8dZ$k#l=Z|Yyg@rf@P3#Zv-5Sa^
zdy~s|*!;n$u@SNgd7>jDBR=6*RdQF9lBA}jn3<dBe%m$BPph!jiVTu=rl<d*lz|HQ
zZk^+mZckeayuI(8oZKWA=q9~yHa=kw7@gZBwSR{)ynA~?CG-!Ai&gFH1dI$@-97v<
zjZ)f@Z;0@^tb~!%%E~%G2nH}xf!ULimxu3H@iCffD$+%-*7xU>VMWE6le^$JuYCLD
z{knGqbBc=1MY<irIFo@!Z^CqHylrF?{g^epFOTnWztMiu85#5a_gv@lN(m`XaVTlY
zs7gWcZH0UHI#F@kTiX^ahK6&HKq~ABon^a(P>V2(T{Sb6MKsMjvHl7R*%3d|HmEjL
z1c^vc6FD(awiB4Jn^7x!3BJo@L`u0haNYTS7Y6^4bnO=oytkEoSNLswGD=Qd&MEGW
zkyT?DDbjbo5yqAllt_O$``L5Po=t0crP-RqWhZ=fAk4z^^tTR-85Dng?>CUG_I#_W
zOVMQwW(t>MDD+AF4bV_g0UPpu(3=?mO>ktwJszUYWn?Vd{BS~h!Y`pq1D#Pt=IzMX
zSg3QU*iT3SR4V+)yTs_sI+~r=%V_&Z5*Qjw8yhvS6uwUm5zxrqxJF}abl8Y2D?0>A
ziD+D)4qV2l#z0g)*WkuOXr=AR1Xe>Ts<cLmRIDk8xd-ktoCO4RaPUJm9AKTeiGs(#
zb?W7H4aMK>IxsCEyaWn0<^hh#Uv~CzApQ_c2Y@(&c?y`zZXL{3e_C2LmdYxH+vU-o
zL1S52aCWxi?yR4y>m~yOIT<-~oGg(a`&;`|aMu8Jyb`3wfB!<JjWBYopH=H`%eUs{
zf37Y!NJx_Z9<kdfTGpEiFz<ASlHyHT;nU{KN=sYZR~s2NU7r|?X%}F@Wnn%;O%9W(
zX49lqd+BX%VVIRgd03#~<H7w0*SE%0p&%E1PS^q6-s$`IunwKSTla|<x|1Wl|D5T!
z&KZb&ko}&23+XxNcDX@MoU|HCK>f+gcsR=+<sQW{<Bu0piDs(y)x<w*SXN!FP<L7{
z1#P#A7!Go5goa*5)Y*!}_8IafI^vtMFQ}~=3{8*lP?p{2)IyR|<;xqXqL3-kIK`TK
z5wJ);g&L)PGO8NCGFw$w`;d}gkbW74;&Ir2&#tn)lKVAFc`h&Rc%hivy_sG=x+?3a
z%g#1|h5+1cCZTLDu^I+$1Z?jK5eQtEI%EJQ;1EOJGumzy2*V+B3u>=kU|JN6f#Vr|
zDX^d2g}tC&((i!m`N*iKp59*Hhrk~3bLAz&rE|8kV-F~aj2wml-j-$XimxN_2;I*H
z62-B2vK4pT#&ElShpmL0+c^xfA%+2tAOd3-Fcv`b51!)IKI#tIPrnSGW+0>@k-0(D
z2LR&_I)j3PjxM^;6oD2zpbIfn?t_9szvUJk%HPYg{UzujL*D~OQ-&rg9evdPU|Grx
zDfb-O4={j%h=Rd0Eu>rf=gKdY9)C9;HvfFBJ62V=Bxm!=B9Zl18m4OeBvo5gcCG8~
zprV4;wH?T~0hy%Dlq;upvC74SI(wfcbhx-`<HGu4ZN=}yiZ?e~Mct>$N3*tkw{hiK
z&&yLquH#&m(YcM$)#mn2*w~smp6n0JhbD=HA8#Dmo&S6yt3k|Tzvoo!T%ABjc%c)W
z{y^BB{P(No@|l^EJY~BqufK$I0Ka_awp5pPlT~+~UVl~bM~+W1qMk#@Z$Ks3m-f<0
zyIRP4Ny)}?X_S;tO=q;#|CJ+-HhWv2la+|F++0?Akq2tRrUPDJ#pc&>B}v2Q-wOjz
zR@oa_CQ^xTG^_KP(!rv9hv8#VW0%6a?S~W!X|rOQ>Lo&5H6!(*J6Y}HXSKZdw+=pL
z^gX-JSxrhklB*@JliaxCVxl7ICRM`CF?BIOWJATuY6GK&(3Szw_c_3!@w#$fT!8bY
z+5Ew$7JVoMfm-DNw)r^Q8rFzc=H?q<*fBRVYo1BZ&u0uMf^<mOBOt*LTt7by7Z|4@
zousu0W~<&&>d?>-^g$)K*$d3C3h<B*?t&}P%+ivFI}hsky@{zQe(3)&+_VV~33-%&
z%!LgH71*OA^rWS`o166^?hCq7pvzf%TLYmBiV^_11%`;3;E<`Vj*gJUXf71<kVq+h
z*n9#8%a6f#44}#_1K$jAQDA$Tm_Wnu7mO%FY(R@Vv{z?YTMBx<HAla{j4p_%q39h?
zSN4IG^8Bld7r8kB!N|Nmm^$XJ6{Sf1-yLt-L6xumO}FKqrqL!P-Rz!@_3Mv1O<^{8
zWC0jR?u$&CzI{o5r22i(O8h=4^OEuywg%&8r+;|;+8~wrjfe+-r2~EA&AG!=W+l77
zoa0R*jW_iUY_&Z++y$6%%BnhjsnJRf-eO<w1&LBkfu!E9k!4neqvlLp^^`t8yY{DF
zSScmpCq*??M<?^<1;$gs7#p5(#mtBj9sF64f~k!x#)jLS6!T2K<+LCDs#9h3)Z~o-
z3rp)SVUOhkxje0bUoMp%VS(3gbsqoeQ~s6D=V(RG&b9lFU<YPw6-5PK%>~(#pR(kA
zElG&soviP9Xyd<GE8@u&8-B6Bi1WVGo6qZP@59q{2xaW<>f)S1>=j?WJgg>1Q?{<f
z`i&0p1}*9UtPw<JW~HNh-xQHEk`uMx?3Ruiu>kXHf?AmyHrvohuC3))bpCSZ;XatM
zAXH?2toWus$dHn~y`LJ<!+ZwKy9t~w;L(kE4usI5D43P+|DF*b3C+&UJ=6H+2(gXm
zmA<~ngpYVIz91>o!;KP90Z-C+%O&LHi?Z|#kQcA1PS!ip9MP7irG7doV^K^_zi6Pw
zFIy70O+b({f>K&KEI8|YRW6;O^wvK3(c_M+%*i<&DN?k?GzP|e-3A789iiEQ{LRgp
zvrEii$(Og&y_Dl2S~3qNOiR=HJ;C?9WS#BXPIjpKQ6w+AZH#=lE<{6eWRn6OKVFzN
z`mVn<;dSR;VI@&n<rVK`ou0_{Xr4lt%lhrr%^({ya|1j|PubodEr~a~{}T#ZV}VlS
z-|XnEz;2etB*}oV-`-wF-2okC=kMgZDDJ(yysPq8J>H>DK2OPrFu52J#8>mRH#8Zn
zP1msE_^9awB&?SRW|es}T@3X8Nad>EWka*iC<#6%Hj@msM;$SdDOPP*I~O>?rX0@s
z@^=ZFjwtygyS~+6994>Y>+Bc%BaW<42WKN|afCQHpXT>uRzzPONUh{_8s4+sw>ILa
z{1zBkyXdR}=XuUU!&#<9o_PQy$x-WFtu7W;ZS7z3jK$q3Q#3KHwQyICPvjAi-~66%
zWF8xB5EJvtiM!F-)IV7;&}Pu=jLMdPz`ZUTRedX8eSfE2qIcpniXTDlYz_+E-x*Y7
z;{O}f5FdYb-?7^NQ&iiVx$gmU-syMP-Zl4P_jEO{&z<DoL^M7;q46=tl6TTW7n!Eq
zz3HM=5M7?>xty71;#yC?d~a8}&cDEpZ9IjuI7jU{AAjgmYGtIT6MNqJi>fRplH)z5
zRM15inmn??CNY^Sv+VtS6tAUVT$r%k<m7eN|H*WNm$Te6y2td)Z`SL2)%2)clkA^4
z7RDiK4cW^#-v|zPpcc*LV#~!5uX2z+W%P&prb4mJsu>2p-(YQ1nLyUq2gE4pV)gbS
zb0F*QkKd3rl-fQS1fOhtA^UCkI&*ga`YUGRp^wjVmv?+B>JwMpIil_7%lpW3v6w#J
z74rx&8_%NB8dEx{C2v*TjFbIaV_((Ure0N)zS@Ahs_UiRc2D0aSt6V_*HbxuEWc!4
z-`z*{JB3hdXDAXL#k_rC;Jm5+3_kv3vcMTI$<EO2T{1p@o^O>cKz7<B9Ix7?bu@Wx
zkey9gb-L4rig$I9M*ZASt7?&Srq&=sa>1^@%Bn@3hJc_Am#kU50Ack>Ooy&R%@^sd
z%NhrotRg?R8J(V2VaG^}*R?J_$1mxYv)l%X1f0y@(^JfRO_c7$9X2$Y*#4nQaO;$I
z%CpC`?v&ix(hwitreG*9C#*Yx`v0Npt)r^!wzyFR1nJIANOyOOq>|Fof^>I-fV6-h
zA>AO|UD6HG-QC@F7w>z%d(QpFxc=uD4%s~WS#!-fe-&?V8_#rH$?tIS_=~flsNr{2
z91e9m(VVh%GP-@?1?SW)YFDSGOb468;KlqF?PncNWR8^#RUH+6hn>k#-r={Wbpk0#
z6z(ax+Hkb2I&(qh;r~8z4s2^ErU=w26uz+4QSGUv#lnWJnoBRgy<%MwUaNU^q=H-j
z;TOfGNuUz(B|n&w6;+;ZtaZO~;49MPmPHV+B$b;ZL1h<ws4Ob--XkzuywkIqe4<~}
z(V2EysZ!B|G`OJa9wM5vA!|y&)~q(U3kPuIj@d#7CYhRE<JrAc#W`x79KFo~LirZa
zG*Mgag<({L&WHQb`FVMa!NHU1ov?+YXLSgd3FoTkqoGQwUmXU6nJe<urh;s{Q;S?9
z>x%E3G6F3*|I_Z;J5$?85hJ%(okvTtYBTtzB9HvTI<X$VN4DP>@Uz=~%SI0s*~=kY
z8h#u4p_961(e3thHPcrEWhy%`yg2GUFEJu(n(Q{8XxG)Fk+~^r|L?{fPJRA@GT&<!
z5pBV%-OR0$emXto`GNPP+Yd<mp)B}C8-}+L|L_P}c%QJWW$>7Mn9-E&cV}cIyrBne
zaHXMD{EZ53tAB5f2Z+c+efBu*Qr}n)P}u5#ts{m0-2dgXTp=lYne2FbM36cqGcyqe
zn*`ogQf;jQFk?MFi3hcwBd%XXRh7q9-!c*2Vv#O0!8^hQg8#Y3Y6XIV0`}V^n7B+G
zMuTnKGP1T1W~Q!+CRe{)ndA^jS<Obg(C-~23BQ&f=2BTdLkaCc+SL*}By+jLW?EPD
zL_}PW&b*5$Vf%=115r7joAYWI(41Rq0jYK-!cmEomQaLxQle7CQrzD>tm~)kPR+!g
zj-qsO3uKNzse30AHLp5!dKms(ALBVBh-9Kc#Hva$AMNV|ic94o;qP)+UIjHNK<B1R
zY3O7={UaPXNjwujvHI4%HDz1Jz&F^cV1D{lr^}3}g-vA}(vg^uocuH1HB`=+hhH-!
zh($?JuD<><5&#9jU1fDu(cQg&ESK47m%{nXNKQrNV7ZkGTu*os?k5??(`BI-7h3%M
z_Q=Q+^lDaTH}k+a2tegKd(KXd$d*ewR08vJGj+n-=!6>^{Zk7K?vi3Is03g9;HQMf
zM@P}jJT%iRgRVkxucz(We71%<HwSKh9qxt%zHR?TABHY%4+o=XKD7$zh%yK>=#-4;
z;<CJLoRx?_S|Z`Mg(3-G1Pb74Gwfzk#<0P}z!J(>FzKRG?>A|3@6Yqfe8|t1r+8j+
z@c~XZ+UlTiIU&riCvEC)66xzrw5-V(4es^Mx<LNOg&BC?-Sz`Zi4e<bh3EA@!Un;a
zBrnf*tB>B8)`jnnxMSWC7|eNUEgfn<8LpH8`m*i8tgQTSe06mWD#_-=#0d(K#z;|O
zV0QLnv9@hajvqGGEQvs^j@JBJaq(u!2&)FC<oyxRqw^gQ=!^gt2vXi`xf0`JV(c%>
z%HyU6&d>LZjI`&feiHIezS{l5vQ0`#-qscrGa$CQ_A`!YfkW?YNgVSM4sM*dM7Wb3
z!Pn0*jGpD*$lyXmCA{W&nJCdN?sk0~u@e$PqBfz^Oo)is&JC_6_pIRG#qjVhah&L%
zEO!(eQ5$JNSogX0N=Ot*LpQ|r24ifj{e!eH^XnK0f!Dl{H}T#=!~Z<Cj*jAZ3=Upu
zQo_c&yAH@<>b1vX#p<ZsC<YSW#T}hUPf#$PyoS+%0EgjSb*xqYe^|k|BtnVJ7hfzr
z1>?u*3?!mxr|&`%R!`~Z{O2u<YE#B-cC)7$_KFEwp&7rQ{54M%*xT&T93Qaw9Uk77
zBO4rw!w5n&T_5kK3)HL+4xl$S49(2<loWCD@(BtG65ZVS0d%{!J8_;O*gfCew06Am
zUAtlQa8VE_?W|BEdVuk)N`WGiP9vZSQxOPRg|4r%0Xq!G-l)F;>6`j5Uq(W>2(_x{
z272?<<c8S2pR(al#GDWE$p?<R$GAH9*-qDPKik<|@9O5ak<HBf@_BhUZNb%CSQ9hR
zJwQx6J(PS6>*8og$u#|oMXb)hliKkeu$)G+a@lsU=ti;V`O)IsDr#-rXhelc(rWeP
zob_Q4;-SE|vYT3j6aKuX10pBe)wMNhP6;@XN)E8Nl#!FO+EIO0e5N%wr$}U0InbjO
zrTlazb}d@e)|}GlgrxoGUD98GvGLzHd3Efu@{WIY*5!NA`?0^~y4cosz`0<m_z(yk
z@bOHtv$X*g<$k%<36Bc6XScZ81CC1~aI0YXo}T_0A8VZ$Z(CC<$=G5)IU1|Zac`us
zFv)gJ%jrlC!fW?d;jg-IOFW20l9LU?ozyy7$I8!zTO}e(>I@6L`o+2eV0nHo5AJ{%
zJ#}~4!9xXL+}5j`?D5$Bm#d?}R?0PwX}Zas1hdAfQ!)ygh=28D7!P@`Gf5zLX3krY
zaNr&!O0uIdXor-*p-8=7aVp9@yDG+}7rIOpjkMo-Od0ZA>*`kN`>K>H6ksX(TpeFx
zSQBP+^IN7fr(PwC4z@tbZTPrTmmFkqJ(l@^LF?Jgg^RsLH{!YI5B&;4Bg$!HyCpz(
zZ$XIY#$*z@Dsq0(1&VuM9%={{NRyM3y2HS43?B`x)O3t0vr8wG>iXse0H=(f%CoYE
z5i+oCZV!Ksm|uPkQhc0isF?sW5O!6P(<)h$<9M4kL$H5$Gu@(%X`0g>*SFR*phB1!
zRYb?c^cz<xs3zvIQ56+E*`2ssUbLvr4+xmadf%Qa125#xHPL0XT45YGRX9^&I=13@
z7rVEc50KP9j$KpEUY}R?*1FfyG-^wE_yf(%>BS`j5n(PU?s3sn?M_pE{UVBEre??6
z&(BvB6ubgb>Dk#!oS_#_Yo^ev+NuE;`@tLQ%X6~1W^iz8=HFkcs!q>TcyDdl99K0k
z>pt>#cEMhKLD=|d7Kqe(dDP9#!CBTFAv_Zk=~IFrCqsKQhdWn|j2e=X8sX}uKUKUE
ze0u8gKu5>uNd|<<t5cv)4E(jm$U`t_ZWPX)$6C!m)yRmUCnm;$5h-PM9m=_XD@ptE
zi`4-1q_&D?_l7N{nD{ru0Qln=Wz)*;S0xBbF@Rt+reZOSD~D^bGxijk_CAERF5tr9
zt$kx6Dc}%-KN~H`qy0TSHa0duh;IRiM9nE@z}*8Bf&^jj=KA_u;j0hQ(h>OVRD68(
z|9nk<jXD$G)*j=<$z`K(O`RPW8|BZ?em+oc@g_1Gny)$9?g*CN-*@Zl3xW2$bE>H%
z1Y6$ef;d3g2ux4EoGjW?R;~x!q}ywx?-5JphK9d>@o)>b{8<!I5YP7W@u3QA2TMf{
zlkHu}2n05EZVcSi($bidlLoNlrINuTYvhRB+vTycLNC#YQP&{I&ec^?J+wJ~qwc0X
z=HL9JHT%`Y*m$iXufPAq^12uW`QaCeljk8}^Gn!fyA5h_E{i{15l?q3>%HnS1Jw9*
z^d~B<K|_Gj#z46N+))PnyfA}8R+D%A{nahoqb}Hu^$~F@&p(NQ1EGuX+%9}Klgs=e
zlF|Cxwe8n>o1E!<2moxbd}Rszq1;7sgID}JR_ByYyPMy8K=JCnO~<%Oq@d}+FhowM
zXl)fUZfr6Y=Y*@VV&mPoFZvsn*LOg%6s-!wcab18o>N|3oGt&v3?<I_gafEmV0%dn
zf$X7N@jbn?*#Ya81z>q<wH=J*;`i9nwEgVv$)fiM?Cb6@>07-JPE5(FW<@%KNaP^6
z)-u=Wzw_2qCnoXpog1+Qd0A}}N{)KGNk(KF9O?N51w(tkAQ4VOgg#7_2;10|XN`0a
z3wr_Usps^!x{4})f1i|;gpiQ7uP>W^$<*?&ot~x;3VYSn*H?q|_kaNK(Ke&w)Xonr
zWlS#buChU7mbnSdC(T5}1A?@#?j*c+kF9?5Oxoi_gbaWzKKBWLEVxHqJ!ZdbO1W8`
z^3q=DzALQ7TLSSa5Z2W*MFIlvl9nc8)XmMyUY#GV?Cgaa*OOr(ZDeFZ%qek!k{^J4
z3h@9Qu|1Zt{crY5Fyyg<)`Ek6@f{_VwyUzsNh^D{6NZz7hDy~1{*|5fz_}QH<pA7p
z4@F^UxyNm?)?hdr?%`Hw8QxG1pGB=LDLgTD6TjykdZ=8*#FTREPd@v*;Ap%(k<Vqd
z0FjYxePfdCAHG|-Qr)#1;vw_Gtc86oyt4MUxxC=i4Wz$-*BHoV>E_xP_>-#NECNC$
z07nG?T^pnt0qu#2$?w2GxUxOat^yi1U_kN!dvSn!1205?tr%bKe8>evs8<_IjEob2
zOA25$j~ljm5<nvdl&<uQ3}3n2yX$M2csBpzt^NIdV7vzN>J>090s#*py<Y&i#70W=
z9)h+BXvVrYC{zPIJ%HACadzfc{$v3p2JGV?s90wj05&I;reklXNMD60&_r)<n@D-w
zT^9NHv;s_>kdW{Vq*8#5?X=Awkx$n6fpk{DQmb(VL#cbiX=i+~zt5zyDmaVg3^I9=
z!)<ZwKPzLo9h8HByN9Esc0@$5yK^Lr(r)Ho_V=}B4%erX#Hms|7lSO05ABON^p2vI
zl>y={d$Zpi4T_W-!!CvDRJ5S2VIY*3_|#QKcGT2sXJz7I$4`MMD|_xQ0Zbq2?s@%n
zVtl#12OC?MIVSqsm0f!>QcQ^R*S#Ija&CIWL>}~|-}!A*r0m>Jr|GTJN=jwa^2eXI
zY_x2^n^v|N@QD<ts!84h0hzsc%p{!MCmAzY<g0pj<27CSMI<^y5N)&qiE#S_A?|nk
zG9k-oVUud*Q(#m(^Zmd$f2V<CylVKZQt}~bnX2{#--q}6wKnD%nXUsaaS-`XEnKTz
z^WQN~_1`2&Q$MZe3OJ!hP%G@cDaxeTGRxWGur_d~y(G|8gEP=>e$g_5{`v2pU7)#7
zDcksRoVK`j8=paZ6uSW!6oAV2c)M2uXbpg4{_JB8Oc_m0O{=S`OU=O@3GCbf70Joo
zes4Sv6OU*9l}??V|L7MDZQyJPOgmAC1%9QZ$dN^!UtF}hgLNk{HueOFXYBzL+I3Fz
zNwij|^v9L<K;XrTeLe^Lh(P`YC=yT*uv`tessk81A#RAMB|v@<;o(2cI<WqN{j-&x
zr!Wkh=XIJr@&IHvglY*uVNsym;Lwql-UEBQADE}gs;U6SJqKKi*x1-(LP?Cz?r1@<
zvpFtjufQ~|tD{{&%FI67H$BxHSon*E>i)g+T=h*#iq{iqKzwCzu&GUHZf>si-0N?b
z?0Uc*V}4*D*D?629*&Zk?qo@pqtleUD*JyuA*L3Uk%b<#Lgc)LSCVa%59r$OePX<|
z8&_aY76R#hrl}|eY6ON8v(R`P+~wzg&wdHVyl|#f*3=L{hBeJTcqfY7Z8kd#1Osk7
zzbHxSc-$_E=U7QAs*}+$(=Ah3&4#+x5=Sc#Y6e+Pbc-`7QUyAjqLiqaD04OituvZF
zT`SD><rA#_aiF6-^J<=p`cnOxe9^fyxeTWg%r#HUs}Z<I<KxNzS_DL*x#?*%T0r~;
z${Ijg(3?tY1w3;=Vt$iB<C6*p1M|#bq@m#`f?=pHE)L)J2EqfT+AFa1l;rr10V=A8
zk&#h2OVAKFsDP3Er|aUIAU0_9b|;JGz`+E_9`LRLIi@ku(J+!Qb0EzctZXt{rZ<6|
znaB|M+xgxUGY5wQSbqb(NNiM;FW{$(1DpmJGKpc`0{+c8u!jLzTwo^-h%kDmj1Qxv
zUi^=;jUuwV7h3u_!`5Le*Obt@(R+zs5fM&a71L33rnl@UR9GEQ%!Ike%i#F?zW)&7
zd4-ufLc^?s6&l0>wxXJ?Gk1*X3R0Y$jsO?a0G`Rug4Kx;z>`#hNi7PvB>}P!Klr5~
zcT=3|uCcx!zL5xvv5fc!COO$XTLxXNtj?#pLU-l?<&Ed<zT8}q!HJ^tcM!9MA$qo?
z#@%=0V#6D>1Vmq|u;#RDQ*n6<OByokMy5NfVH5hhdHqZ)D5vum&mM@tdK#z;!r-1L
zBBdM<@U7m{a`cTZ?63w7>4u|3`^~X=CDvx%u607r@dR)z>R1hdz<0S9@P0tL5ZEO9
z!6F0y9qcm1^9x`u3jixLbac}JAR}}Kwwx?mJ8j}xf9yVI0y?9OqaGmqrHuiE6ZR(w
zi{96)taZSH2-XIi6o7sH5N2UvVPr%Wo1L5E2M+~gFgy@UI`{+v_f68<Y2aA@JuD1(
zr>-n79|L195QRcROABz<&TdQqMEv-jmj`c&w}j@`Ju`zpi`3Vo-x(6?wSd+_u8U<|
zYQ^WTzo1gpl+|JVs4bcK<50le{^0%RibL6RJGBYBO1{3ZO19Sy`<)ry&2LD;B&Md+
zj%HOVnji1t`g2Bp3kZDed9qy$Dn*oKcKX6rXUYIOec`P#ou96lH=Nq?LB8dmedHT7
z;TToAMd4p1p<w1_VqiKzukLD9pIaKyk}MPaDUq&_Cjt+bzq^MJ(l54C>BsJw(n4R8
z5LwD<@~cUFinC$V*h0|Oa$`N%{Az04)(G{;c>dG-ZxZNnINayjMg=A4f<tu~^{O(y
zMccf_qyMZ)f!RSVCSyTi-9_Q=2De-%jMY2lC1fd{A7x)M?A&PqXxVpN(Fs}9053W~
zmB>dv$u4hh!U_Q&p%K6+LesX<u(8$$zzfKK%KBrFs0w^T$t^7pzJEso4W<l_%>$qs
zQOl=%;b17)Jvst?=p!g@rMAak{$2-na0noQK#8*IY7iJt1Tu&K3zfzVd~+bW2bi-h
z;VC`K6@Xka2OeDAC>%zuzOMk=2>dVr8L_jld|iD$W19fs5=jJ?ZZPr1?yi~11gPz7
z?ChjX*}B{vM?JWmkmjYVnL=nKNhg(9>m69GVO`Ttm_JdZLLT=Va=Ub_UYVYxy~Obw
ztnYar<jG21Yh`zu6iH5ffBJEiVg<`e@n_(LtRhU?&EnE*!P4z#pb9EiE3y3_n+M=e
z(WR!89Tm6F*KjzFO|7IY6t|>O<<}TJea-1^)eSpk@5=W$VlQzHF|@P*Th@%3-glCW
zn?0^1bBBnH2dFyD+@w|!Dr#XUQ5HfHA*s9B?E2wV&6^ret+g(p!Ri6Ub@7^&f)--B
z(6T(Lj@k>AgWcuCxcUA4U*^3xt@2vz3Nholt;v~3QyCWph==FrwZz4<%Md{g0*-qY
zK<NhPWy`?xJO$v+CWPvs>p>m@Jm8F+7(kHX<K+d`T6y`apDlo@?a%bs=;-h92<P7A
zQ9$zsjK+G`vz)RrRuh}G)j7Rc1;}<yQc@D=u0lI~MFEpZ2J}P#NB*UNClva2XzQ%C
zX~yCbl-JfKP&KQ-pxJiA>Y<Xb4Gi``lZ8Q7SW*H<T9Tg+=nlCGG(TGY{hC^WYqDtU
z@32tq8M9-p^v*hn8nDWFFG5$5Q`6jtTLRh-GRB80YJ6Inctb^=KB(wwci$apwV4?I
zVnHK#|GVQJmb>fQ+<HJD#b-j!*4%6tWP=WZr!(MN_!5_soBJ+c*^CSkr=zR;C_tVm
zSq`{7v<F;hHu5wM$`Dw`0MYLA&iC1W>{)@p7A%(jV`y=(>Gry=@K=xCS@8|n@IFK|
zoxKPpFom|+#fi;+4Mq0)H&&<bYaE8xQfeJc2toSdFVSD|lU+guH6^`rmn=(izC8Ri
zwD=K5JmkEbvG?((_J)AGKc=^U-lC*yEX7a!sKKGe#NI3H=-L_euLn;aF_D_;W|O7O
zW8bU%>MK94pfQGBXcrB(_P`=}STzmQ-fyeKf8SkL937Q9T0h85$QPi;gvOV}<cUpD
zve};XV~V9J3cHvY_`;(~6-Bk=a(^zXDs3PrKetICa^JnB^Z3>bcv=EABuG`cT5g3l
zgk{!i1PX%|=c5$B=mfW&51uhSd&8U8e~sQux7?qGCMM41x=({xC%`!Rx@Tr;N*a>4
z2{b|*S4q!V$jbM?v=(sPpfzD(U|z63C!wgvBqV@;Hv{w|rG(qIfW<^5pF&7XjDqCa
z5rht8UO>_dw3dL(PQYOT`fg6sQHq2d@F7sSz`<h8xJGy}-O$wZ065@MQ7{+2!pN2D
z0E>Kk-AXb6{3=sDci%kmniaUvAKCfII~ov$DGU!m?`xRm=PTD1d<f!x>2!Z?3dHfO
zTwLipz>t;!6Vvm}1SO?7C8fTg;DWEO2Qu=?Tn#T<{J`|I!spqvt;iO)3kil;4s#;}
z9qDYc`0qHu9og9!z-JdX4s0_ez$(~syhmYWu_Kuu^=RI;YwK)ubWvQ4qN6)7FaSjo
zZDd$zdIAJBfH%lT8BU?`6J30FVn6Bh)+JiBzR7pl!LScs_79*?eEhuSm)3dYbQ>7w
zyL<bv7;<R>KJFtFwtW;=#0_Y|Z=<wb9?|p2nP+6NEKSkpeo|nbW&{*qjd1<n-<xK%
z1LYKoaGwL$t*wth!Z(l`1Kb}6@471t0)*zRH&>^pP-~k@-cQ<fhVS0B0saL@y?Whv
zH6{qOdtKns4-Hv?8V+PcZ@+`Io)=nEBO~bhmq11WRt-=nNY#YwMo26x0mzV$PJJRG
zumU*T-rEEJhq|UFKP6>0IKY4+q@W?las%K+Xk%Ee$4`DDh(5xA5Tq4Pfy8qUS||Yl
zfoS$QA3I>$52hp*K0Xi<bO9RW>eA8|{rnEwFS<(7ivRO1X1i4M$$lI}c7SuXB>Fq3
zs-!-X1;GyH2E~fk?Ofpa=0{A-Qk{K4JiFxV>}?p)95b7pZ;SV9n;q>2{i4D~Sy_lr
zKX?Tt$tB%}6LAsT5>M7}7n>h`y?JeE8K&K^2?ryVoz=KvBjxBg@#l{Ya3}1tMcOxR
z^_7C5RhfKM&y%*@9v}`FIro=yAEMSh8eMhvd{&VGk?hAVh}=L#J39WWaJ&D3<%L+Y
z=UWA3R;-(DE|`?Kcw<Pl#}bzC9+#2co69T9K9Z!nQ@N)y9K*QYNIv$%o1mP(9h_NB
z<|VUqi(kQv2<A#5??)|p`QfD{U0wq%3Jf3~8Ukf{rv~&fS%BY6PyX8zp;ixUf53Fo
z0>apf0D&|}znSmv^Zf>Bf-ee~Q`N03ERgO!0D%kS3&BAxRa?%1VdEzSWN`<C%m5h(
zG75^SocS;y&ZDBjQ7wU2WH=xCLkln_KV^KW7noKIRuz6tFMGkOVm6UK0p5#eI=S;<
z;r!-!o)T!90WI~G1DycIcbw_uk>mX@@0w}@VJi3n0qo$KfC%}&ds9cnTD)c6PnVAm
zxmdWmfC+pWgw8sa3pbQl<GMA3CqLYZ4AF4f(FcrR!0p163{OgoJlUJ6;HH-QJ~+_$
zEB3OlqhoKeX#qrYE><6OWJ^Q7z4Ixy5=4zav$qFg1RJDw;ohmDdgI~r&+Qr3^HP<S
z7R5aXI+9=i`_1$N>4%R7$GBqV&XEA&pKg4K6m4}V2?Gz5Pq|h~Pj985yw95dObuIW
z>aPx4_3Hx3UNNzqogD)M14vhjfo~%H@$|Y%qA`ZFFXnyQt&>Ui-TSE0zBAdAE}9QU
zX0vmHMFvlgLzq+y_IBlAp`kC_1RS=%b#!Ec*<pWa30mZvY@!n~A_Sx!{T>+DvYfl6
zq>`|na{^~+Dv?V(0#fr<Ny&1lnNmdLOt)o7z{w(PwO=bQPOUaO`^5JZCN}GYkg}SB
z!gO)+QxLi*8CjP{BeIU<b&gBKHs>OA2=_BV+3N+UC8~w$_!t=b2G-(gq~GyvtfAWf
zpghJn(u;obEQ3*h6=3*e<;+Ql%@pr)JE^C;bTC)_uwQ*IE{k;pWYo8JcR<bs*4&fA
z4UA|#y5mqM;quK`)t6thP03rVR7rI|V5qt_(=O7qvrth98nW$_er|$AaWI|87d&2R
za3;CF9RhiDP7akI$@8V}Z%)fu!_^K_Km;33S*R@CwtsM2y}f|rvJxO96etA_a)3Ji
zrx<v<fP@nuiTWU(D}S+#Ze(dmjLH7dR#S7P{fDis=H~Fy{i=9WeSKGVnz`dQ)K%@I
z)ZF~^Pt6<-N_K4{(wgsXQ4fsHDu9tC+ZU;_?}Dxggkm*^VrhZkTqelKVpuO%;)T3m
z?;43{yY&l_m->ZA(0AY5xSF>qjI*KJ%JLe|&r6lL0U?`0(f1EJVZo&8gBXW@7cnMZ
zt?Lx{R}+9zzX{0Mv(=XUD>nJf+^^RqKO%Ox@6Bk&zY=dJv~b|~rZTT0t~B^+EQfKX
zV(ZJhwl6lM(DuNo26!$XwfU0{4S{R#3>bfFDJTR*Mph+j@A)HkOG*^GoRA%C4q8&n
z>!2bf2^tU88V!CwTyRIp@;qI?j*hM|9x7Cl5W2cbb$s&bt9Kl;v`C;;gx3d>W@Jdc
zq}cUv67S{Dxn)NXQDZ#6Ai8-Z)qK1w=tI|hA@R4{+P?fWbbw$N&m`|*ip>oJ@3vfl
zOm&`Cfz*av#kV+?)mL5O#;J;6_gMFy3JSbJk9XsECtgX+39jUtVI3h^0~q|;n50K=
zhmCBl6F*SP23dK@bMasu26#d~hPbTZb=|VJ_t+v44x_fjUXAtfuh?CNLT|QfbjlxV
z_)9YhIxH6YI7K)ON>h%??_t>R7DVXsx~+|LEx)|@cFz~3P&f&v=J$4&hf}-pDc-Jx
zPMKm6PgV(p*@Jpso{;dt=J>6w?8<6qa3Z&)sTqOALOnpT_|2!7IN0rR5N-$v%rrE1
z&o*DhCvZuiv9X@~VQ!)DxkMq}=W=s+0!-2D+?LRA;)24<t}wW_oeK;6k`m#H^XFZu
z!rs_8I5ifZg6ir_i5eEdqC>C#xI!+qTz&1}i38s#AT^4MM=O&>l1`9UJz>1W-70k^
zMT2K#{`vRm>Q*6o`3IJ$B(hC{;NK17rwh;vDw5~1u6kZt2Cw<x!bGHrjU$uTLz4Rf
z!itb$(5+FzLk-IVODfJeRFJDU5k7RfQW}O`RDPB7!(|H@QBfPDWtvdl$Tixqmzo#L
zP5G7~QWNxf`k_)rNaWYHDJhMjf+E%W3EL^6Pd0acNgg}J>~Jezd5a%kYJ>;jP@YXs
zfxHg?um<ktgk@GiQA!Sh3O8Gb$VtFkGj8x?y%}PpsA0pS5PYSCduIog1is4B>ny)+
zv!1P#{<8izlKcS+5+uA>vgtifd`^!c-eI@@Q-h~3+8|X-0lEbN3o_2RC()OiklNtN
z9M?sXWkm6Cax5-Ej7K8w;%q)wn;w1hWPZ>hmh2~6d&-op00qlX1l(&ZE<(OMb&EhH
zViAiv>(o}9jV9xf#Nj_Ad0oEz3Qi`$`!Mt)MEvZ__q4kn846+xht{(>UR5F7!3NH{
zO^6FN>Pj(Jm!2b1Q<ri@370`Ll_tM)r8VtKcv~E-Ee~gk<EKELma43-;bv2hi||<P
zoyio6X`R1;8WEawUYnwA?q-)QxtmjRd4TA0wtk+!k5$QKEHsX_(uVySwEML~h3euY
zthrLfsg{xD_Cm9`SKf4Uc5q8(n3M+f<|Azf|7f~#i{)I+6DW|cK)bds`Lgp)>8lG6
zXwtvY>zL}iYY)&Jw)t2h9(svMg@n(le;!Ack?zRE*>&}&!DfZq{q7^PZZ0E_*Iq}F
zq`#p)Hj}Yl++=^?#r=%s9o?m*uI@0-%ehMyWm!9}-vI=fS4jy?XH$5>&NPeQl{)o&
zdt!7`?P5Sf7uj1ct8G4^F3f7xHceYFEG*?(YjnCECRvlz%-PrU;Ol8@)#-17pu+d9
zgz#YU^N`K(3H_zD;6-;6_RrNQrlh*mQ$5iyt0hyusw8y0m1&ND6&<k`Crd?W0`T_C
zAvfLEyt4!NtBO>SKKUZrP{g@QzLqsQS-KI(ro2`=_vyqh<+$_?hJ~Ofd=#{+%APF3
zoDL1BC1PW+p5|8B5=`^eh8sLXN#^LlC+~@X3UTHI%=4=9@>83dz1azk+osfGD3!+y
z@nE7^%~?mYhXww;wI775EA4S}&PguU_Zx-nYD#S7iI80wo=SVGg>z%>15JnbAL>qY
zI(pF6phe0X-gA!g*l$_eU){rYLO`i%hR`fBEa6y3{UOOVtWGZV`1mz9yk4)j^=wV>
zpv!F<TRQ82x+~Xc^9hGNzaqQ$=eb4qj(Lsjscku1!v)8J(dub`Xg6x;(n6IuCbxOT
zCw?z`O9Nv5&tETkUV#^m3I9M_RWT<cXLg#IuczHgk>pN466P~`jvl-u^N`HF<r9Z6
zrgo8)k{UB)WNbLyG8x@kq~fD=D!wBrK|(+6P75Q`&px;63IR68<?d<(a7UkJf&>~l
zAZ-JV@#B^L8Rc|_130g*+3+cgAQc&pBx!gbJ|dClAe`CNXnCIY=Rp=UIS(d^<CE*q
zeb$5<9CbnNCb;OwdD2`elxY^_$0{tTnXEr&O)d0U)K(Pv2MaOA9`bneA0Bs3eY$DY
zNzU+GNNFB&`qgZ7m(?0xJs8EjcoH&m+Fkwwo&ItoCfaRA1I5W?EKgkN&E%v34u7>(
zJ;&dIS{VjqHO|Y>l;sJ>cd5mllw?Ta;%qb5Re_gUwVFD2)ixxl#j1`{jgm~RGt)W~
zS^i14xZZ3i;Y1<6nd6y7H;2ch!qqAYjif(FUSS>CY1KDmHF`%m7~A{Q{&q8$fRDO(
z|3M;7UTo`>hvHOChXeoJ`;Mqo&u{tgOcCynJrhAX_WcNF+&W$u7%iA9X9|Qw<Jp5`
zx@w}9gZXMFPoUHOf1d)+M3Vk5Qfa4k>`<c{2RLC$tcg+eVL`LCl-EmbY__R4!SC6j
zGzfN_!qwT^vKP{dRogKyO%PSCWKXnX9X>ylOl{rP?dHQDH4g>h?T&!)OUZm}BeMy6
zm63&rPfb_aa$|atitl1Q1Z_LswbQjT8^#AZ%ptz)bLTIemkXAA;Vzor)uR`)5{t8e
z(av~jgy%PO(M}9}^0?3}tmmT9g7o|dk{41Mn_A8ecYTwrL%0(cLXaD2)nHzVu%5Rg
zZoCtZcH|{fqxf@MIv8k&*YGM+D?_V@<`tKmXEasJ=%y-_MGi7=+lIZ|L6gJXyeDB2
zcBiZ9;^KH`w+>w~<^#(0SqWha{zk*OrxRB@Zc6jn`r_VzpxR-}fXmX%=(WPPmg<CM
z<C+u1x8P0yl@|`nIu^mN03A<*CTd{8e)sk#hqmlQJajTij-ih0xtpPaHH+CNr)`fi
zb%C}zzxHv?u#3IkFO~5=a^q=-x3dHoc@{U3q0~<veJnjZ@(rX+2h_#9dXuG(ueUDA
z^1cs~qUw61RjsX$d&xENS7&WlC@Zp@wz6U=^{iiWzHgSB)MkYE1w8DtD-e~oPTb1x
z?9Mm?JLnh>UEi{-?-{pDGq#7V(kwyWKns5#=d-Fk1mZ9%>X#qXIw1})Fo9^;p{y86
zA2&ijh}#j6*7At<wi=U&vg$GZus&9zi)7_!Ux{Rchr}kegE5{~mOXzLDo;HGvsMee
z>1WQDAi_`-^~ut_$KxGncqlj{l!3FJoTZ_|78$$O@!#ESE=)__ZvCi@Oj}I#{E-iR
zyrnDTWnX<X3KAT&wY4|esIALA*PNHUE}RMz7xE*Uyqq8{FaDY5^K3!Q&XeI|Wsy-{
zvHz;<H$ukpC9O-ZN0Cgwn-Yo%j`El~FJ5kJ>hSw<mG$Q~D$__sJ&ru2*sh$%Saup;
zs2QBoIz3m{XuIV_m~iPVMcM~fCbVjXRK)S7aCPoe)?{_uVWEacSw2>YAs1#DfxBf&
z9>?dC_Qa;p*q^7+IFmSM_eEZ7G&PR+ja>(On){yI;1-hhrm$g}=MhRv$9|t{BEOZ_
zpV1#x0Xer<Ou}?Mv@iv;4H~vQ>nE1R;nuv!>y4R#mP>CUWLX=0;*GO6UqByC<{IpO
zbHod%L*C)?Iyo-aN6+{n&y?D~VOzsATVp@d`fW+S5;AVE%@^5q3U+Wjo=5&uzJo@{
zj@C@XPdjrL!chNxJ|9DB?!OS?y&%ITL^CWt@V4+<vt~czlPH|#HI>@p+I+n?aT+46
zVrHrRPSj-FUSE>?$UCMxQGa_ev^v@E`Hk2DZ-kuxDx-qZ42yer_WIt)9F?|^mx26!
zWZ0Dc6@#$H#x+TLS!Qgw?tx&JfkSjj#pE9e9N5;-Kq4*oRgck$q8bbxL60U1m%+-5
zRzvX&*Gvr6q@)IG!_zuZHD6p*+}{NW)okG?L1U)s!`6K-49)^VsGB~0^<Hc^#k=0R
z?rbN#bBYama&+k@=$q=e)3!>tA7G{=>-Fh<v`pl>pBcO-{O1-|T>H8@7*(n-ZX(rV
zL1K*0kALKj@;W@DldSjlr#I5}XgP4M?i>$>yb>lPjet}=hzXM{5zW=8G(T65teh8}
z4N2Ca+#wk}!En>x_|A7YEH=}67S9mu5LEAl;?`}A&UIb3PBT;5>roR~6$*S*m#BQ2
zBpdF;&)qM#`rmZZve(GK+&*w^EW}Em*h_S3%QRQt&bL`2UKutIeX=p(3rN$C<e+*}
z9a$%$7abu(rgZ5-!=Fo(I_oZ^8-rWH|4x}KkJi91|BUlwh=Im7wSl(gx5IE&E79P0
zFDG{4Znvj1;R^BPb6ze*!G#K2&hgj(+|T(^puMxVmL^2WK6d>kCAL8LW%4pG<O)pU
zetv!vDzzME50cQLABEzuq+9VYG_IWAExU;1^$ZU)v$B?*W~aG`zTtdDv$)}***L>O
zqCYdu^i!l{ygbw2wnE}iwBB_4H||!P1gf^v(dim1NC$wuP(VMu<szCf{}yU3*0SZS
z=&NlsbWeFcz8Ja{RqT9OnYX0S-@eq_)23(pH$SQ7#@<Qx9eQ*WD{FhK4-xFpX}Tdv
zN->!f6wU3ZtvROLAZIm8^hB6V$_Kz=y6HT$=ib7JJ51)t@|*EKOs+fpcsPea%j24}
ztL1&>HlaZN?;B3W30&g8#9clKYX1z~dcSRO*u}Ve;<Y@qhKbQ8D!Q|@ik^^Xl7ABs
zpeMTa=A2sI7L%uK<Nl&REMz?1xDg#W{rzz1B~#O~vi<8URVeWFBjhHWskovq+V_9x
z+dN42?Hl2yJv#rsJA4^xIjuf5*=yGN3)@yQ^FYf|5ZROH)(JsJz~jtUP;={G;ml;A
zS9|NSL?Ge#PAlxzEH7dVlCWKr9RDjhh1=O02F+SSq7zATNNK~DOp90&eE>6?*3H*l
zc?_ouGd3c+f=!1V7FPHh0o~#R>VJPf!%6`)q%Jd>&Yr3pir{o%n=KOG0!qf{GW)?6
zx3>zN+OJDfKC{=B&U}i7cqdH0`uO;T-0Pu=L;)d^yszxL{zl*c6CqT~{dyGwT#cE<
zo5+uSlr>6=6e)F-v5I=3Y_!<(6+dN_{P-|*Wq*@$Y~#+%R6PIq|2&V*#5chMmHV~u
zdljRL1!f5QP5c~emOS3$eG|D`@s%c(6ebr9Q#15(LkHGEAsHLDVU_la8*l@7(?x1<
zA0#>hpX~Mqv}G-;aFP?@WQa;Vdmq0?7vd+r6G4T!%^Y)Qq7t*ZL~y_-I-p0FhR5)}
z;9(r+q%$*D*Oa9Hpb=>0Os>#KkNn@q@l>MBmd_j5aTeOITubrSAd<`wF+y6AV{ey(
za?92R_7{72%zMK>O?4D6*FW(-HKvKlpwUE7&~QmH^u6jdv{*Fd<ovvrKH|f4(B9P0
zH=VdhGtFCi_-Dg?!7{-|FfNFKwYfMsA#oi4#3(aC^m(qM2i5-N>I2!G?dZbbtJSs=
z3txSnImxZ&{%&}!ie`ds3p{*TJNQxwxiL2Q?4sh0&zno`(V+$M+}NDhC=u;{Il|Y{
zylMOBb12vxi5v;LO+t`$-|5w6xc|)LQB_pv+lZn_bnrWfL952iv_%Of10FowU;Ecm
zggrRh+?lYBtfBus<sdyP7NHX&`4L`RURuC*&DG5_y|416+`E^|c4s5mK7xsy<H^kj
zVt#IIhn54aQN^)gs9JU1TXs0N4K?i66p<fo1DT2H9)JA#`?2@{4lE;DE`J1kMZFx$
z@~L*$41F0|4SzE)E-HARY>B6V(&65pm&ocamtB}>x7L}cN`?F<N<VVlm?Yf{L<qwu
zHh`NUJU4JFcqzi!&Yb#T6MBLM;T!@@0!%K?(XdpS-(rsyOQE=KY?Iu|UdS*I^k!kT
z2hiTIpoTV8g}p&iKer{Gq;5)lw-YZ<1Tn0L^JLJ;{bM7*On)?!%AD@v4W^FgabFy?
z;Gi&Ji{jk;onDyn&+e^&>MG<rXR0}NIS*<^td0%-Vj#$mq*K4Qe4Y+g)|TzDR|n&C
z^9|BHx8%J3oyJ=casuxBmw){YDkKS8PFx0q=}m(x7$g|UK055a&2@W+ok)e8_=?fK
zic1@7`K1Nnch_q|yWviL(KV;J1cH#Pa>Yo8!P>>-1kPge&!g#}TvmY(w{d>FZ@YiZ
zYG4oj;UsCqxqZdfBq6<^`e!{-h(ShF`PC6Z+W-AqyeD*zkDs&i*k7b=zGNhD4%R|0
z<YAqs7Hmix{N{oro^!s3GheY+;9Q3?HffP6(2-tr6Vt%VCtg@z|Hb3)t{?FM1DQL&
z+whViS(J`0d)~qC&dVYh{qCzcoH*NWbg~}C5m)<!ley-$d3YaXOuQ2{T@D{ZeY6W*
zD0uBQnsJX7NkR$aR$-LjhW@$roS-`!2-S>hqLaD<k1QB}V5@jq$H1uw2%_k`Sb3^c
zB7L0{YRB?TkeN-4Dm}|FZTRy_RMJ_m8nx@7&+3Ea^VNYSh3&ICcxrjFiFWNbb!0(B
zRmnAJJ^Qx5EM^A{j$VcMN@TZzwBJc^xMv1)XU}r#ws%kFc|iPzY-eh2dw6IcXlh=<
zgpXV5VI8k-wlJO+7W`-v=LCp!lbJYtRaW&B|7TJ<iGW=tTI?pi521Ni@)0XDy)P4=
zcZifLwU*1EyTrRI5b>Hq$zR#{`8Gq7I)DY!F_4eb!>PbvstiCI8|!&)XJNe*nwi><
zE<|M?-bi)8OpT>mN<n@Rk`YlXY-o}r9%y8D6IiE==d)qFNo_53ak~RS>aKU4bG$U`
zR*=!gBmJ{%76Z~CpAjYWf9GOCQ6M2nrYhN;%(uO#Il7N3uc`9~OQ(a}VyK~et{NJq
zo%{BQpRKQR&!Q*#1Fi`7E!ZtiiltF?wRPC6R=e7n7dH+<@Cqrh=G{&v(>>27ea?~#
zWeg-Q6+@i;eAwX}Oo;?Lu-m8c4?IxnPS!7yR!4=17(&HIKcBc0Y9=z_ANG#>-z_{h
zr?V(v2BITh{Nn?U!92v=`Tku~ikaz0eDM(hAvhEE_BTkB<gQw8hAAS*X|cz!+SN8_
zm2B)?2uoRI#|UP3F(R?3S@f3Euqd-dYJ@%iVvwJ~`cwqixDu9<Bej3QFcn*J<+VO@
zLvkK+k8`)!n<<@s%dIPEN+h+7+}RNI@6baB)tQ*xwrmFOoktGAFP@6QQiXZ>7>jen
zJ<|cYCA!)K43g_j`dr+T6A4^BW#;P|mZTBGYrccAVaK~xhPS8Mi@##v6wA+)T?vm+
zV(lfboD$Hi&5depeoK}~ktwgpEqHjW_*Cq5U0m3g=HT%Luc3~zQ9nf8U(0fZXB2(E
zP<wSTIfbR@xa`CY1&U|h!P^t2+;=CiLJW~oHB8fYt2%U)F}#;Y^Szl1NU8oppPK~l
zlT>AID1~!*7_7cjdKY?OHpQXp7&o~JyS!uWeyROBYPw~2`$kmlBa-P%+00+shpB;S
zRaLFnRHdGkAK?!T9sY%)@Z*+2qQ|Q%hMi=3DaNQ;pEwQe?@L&z1|gAz9)=w`?03v1
zd<}2KV}}1$3A9NeV_UOcd?p~QjdI~pomfzT`ftHgrb@;OHed_r?PJ`uv9aXzQI@Dc
zH-snr_$?&c)YZr<a_qY!efPQP=iiyM%IL?nSh5B@uo8@}*<`_DlW8eGh@?Jp1f=mK
zQUoNE24$-2nBvD#)KV0?T%(Y(U~!NYq`^#k38o{ci%lW#Njx9oDk4zOz@{Ugz!zNh
zM>-yO-ugp34{?)&^UV*3A#0j)B#YO3ziwXs`T0P82PboQr0lI{&@Z7h8NSrAIqjn&
z>Aa}>a6TN@Pi>_hOQ`Zo6Kh3~dF86EH&HHS(cN^m{)CPIN^~q>L^N&(e-|lL=&iJF
zXpV_bLf_~zXP*3MTxw8cP!>g;hs(kHUt9Z*aj|&uJ4)pmDFfvJA<i}Av{U7Mf0qAD
zymjeVs3kCa9#`MKQOW8Gx?1z-9S~;or%<IZjCC;ktKM_t!(Z~<1)4?N3rDpAzi!o)
zPnq{6VI!5T03~~k^m+o(il5e;Ms#rYhJE`?Aydeo?aI|XQ^W5LH@XoEq<X0XOYFJ6
zmNr<g5b<0We(67gWB#9F0+wJK74z-<0sQA>XT%3wnNRN=Ek6$w(|WwLu3m~llq`-2
zKRF}b$uG{Htl0@#%$yIDIZQl#<JdUnpto#{sx#zuCUkkd|C>~LNx&QT$^1Ov_rZ*{
z9qxW2Y&~I_6agmMT+jR)mcGC!#UR3uD!2pq#Bqs{E%#p>-izS(k}486qS*ba-<<dD
zij`n9WsHc^5$&8~+UX1<WZk~dLWb!UkjYP%_|JP}_T44bQ*+g2`PC4)3A$r@_q;^l
z6SQ+Nmi0|9&ogj(^i@w?N2Irw6V4cAcd_xksFU!IqqqN=1g)Qc>mr!v5*oxE&__`|
zl1y$IVzxJ)A0L({nkwg^qH{TSbq0-@vLsD}H#;Vr(ehS1TX!r{hg|EP>{hVuf^6FL
zHtFD}_niIX{v-XR=l=IW2p{TU&RFT<YT5)SBU}3k-^gev4fF#T*-a-+T%u7K4O`4N
z;a%XSi1}-8ulvT%I4dV=Jr0YnMfu|>i<(LmM5V)Wut_J)x};Xzf2Xf5P||L6xua*I
z9ew$Cm8%H-8=wQEnd3sV;>q$@A;0GyWaq}8B3G0D84RV)^>FhML<Ve-u=9^Uv<87;
z=qjUcYhV$W4e9;g!yuRj3SQrg!7>aC_9VA>(_P!RUtEWk#XpWeA~Md!SMMzvZT`p(
zjmqJ@*xk2Uo|<Uz1cx7z;e4nY*?2dytJJ`386k+~%%CA%vsl4T8^#zmIXoL1lfrLp
zO%ak&`8@=`nUK2l2W7j;H>hLZHzzgB?A^|(hkRLyXKBS4kq%zVr{Pvy>9*u0sf~_T
zr{t-@i-ah-Ip1*}89`eN|DU$_8Hy~{!Y3qTP+nnkqzIbv#>VJI5XTS}ZUG+seR00v
zsElK!xx^`@yQ~3-T)Wh0c5+;DCqKdW@PTiSV246xwvxhBf*jG0Q>i9P(7l1ebn=f;
z=K5<5UEP90F%FB_7Z7TCn8=dUrnTKN2-xtL2TuC6yC1#hiHb=eie(`%JjynRi+CzC
z+Eq0sx%(6Em!;2T#Vdy)ZzZTJY%VDx_fmG`vgzQS{LZ#^{ZXY`?x5kd*?Ph0MpdQz
zw^i-;|7{~MQh%_}azx)U5~sH~=ifMsqFJC!4qHX}k<!L}@%HVKGITUqtq;f#-pabP
z(y__sg;At`UQuj}2B!-M)zqe8wX-e<B6a={P^61xt0ZiE0r~y=^fobU7qY&xx!0k#
z((~oJ4B0D){k2xS6kUvz$~TXAF$o)9ygP>)vH07>cJfwY1+4Vx+ybF-ahumsqU&2&
zz63W=DZgcqWFWQNL!=$q`aBc&#@uK2|9<Q+01Ed9E}OkaG-SJl*bm0|{QcurI=JLl
zP&U%DvHuPW%w%gBz5UYZdOhTl(J#k^76445mXeik!&3y=lm0>iHT?rdN11NcZFh5F
z-1%Yx2D)COip}ax3GG&%*=x>kun*+??x61gRp|LV0+@?(L$F6!&D5mef(?I@XnPYf
zQd9YR-#QzsX?)5`iQcc~!M?6Mq5wiP1efNb2J&0~PbaNRmCp;JDWn?Hvc3;f^71f6
zm8J}=MX`0t+J!`(y-s?`6t-rlNQ*aS^Zp$}HVf9PIoy~aTVHLct~fpT_pAD>iRt39
z+JC2oss3hbyBrmVP`8WlMt88LcFvLH7+;xL$49Cu>(cz%86#5G7L`+u`TB7XbS223
zEBWfu(&Lrn;0$)t#|_RIZA!t&y5?&phFhP>W2C$<mz2p@PHVSNb;FKR5IgVY{->9C
zchdDy`NP;)bxQ5yepPsa&Fx!B6>OdL&CGrY7f$ov?4Eb<g2?I8kh7pc$pcNUMf;VM
zAH$zm#uvH@BV{ByiJBd*XFZ|1CF`6Cr|S%lXDJ+g`oeHc!yNZqXE}~qOdu@vuO6R~
zmozJ(z-iY<x$~i){M5+3%-D#oN!3OH9PKPs=PUKY&pd~5D{z$7c2-<huhx<UXQew;
zFpPW@UO8~6%)i%$?LN7hxf!;~etYuB1^O|x*I6O|VoUVYfvB3z+~g(4+l7D1fT#aD
zPD&8*i1VXrq%zLBDYwZ<xZhX+7x#Eoa&4o0H_l4af!`GM22M;$GQZPYou)KuvM@2{
z{q|rtL{3XyHp!IL4E^tQ9yH+|{7tniGNWHCPYtUs$AGBPpE!C&f4CCC%BQ}EOQW#!
zy}YLrfx-4!{pu1#U*)TcE#d!m$ASXxg#2Cue+x+|3pRqwpVsv7sOV_>`{};@-_1Rh
z?Y^OrRsTtud+zmYjmbQvT~$%uXZ>1vH#w2Szv@VxWOojTI};ZZ4O*a`cak9<EqWeZ
zXp_lQb+p-pOb$d)A)!6Z;hvjTv>3r9CBY{AW@VoAKc`O~vO5h4(?4J?&8!LX2VU%G
zXwkt=GDeQPJZTuULy#LLD#|W!-I!F$#ouFQBsMn9Qo}8e{ugouqq(i4lXS1E$NJEu
z$ES}ISE{ZGFr-pYjcGKy6^C+lA)z)McJRn<?WvwY`EbO&ym20mQBz=FxCHzEY2i_P
zgk2tnjP!kD%DfB&8FMFP1Ew_*m1|Lz*x{JxZUo4Ut+PsMwD+h>*oksW7BIMavWq%z
z8ipHWHQfz+YghWOVnebm6A3EWOj+9Mnm&_j?|no}*5KC0*jCY^!pSd4xR#8#+9q6_
zBEHXmT^j(~al|z_sYNg!L5H5bwO*52NyciCW?;_#o~V}opO$q2w5(f8uhc|^zP-Pr
zr%53DLw!eJ$Zl{Cyex9F`)Z80tkZikA;Wn1V;Q3CbxAjsAmg>sE2-MEH~s4l^!axO
zGxNvN;os@46BZ}}EisHxxy=pfyGMc$HPNvDyqSQ3Ae8!X^gtoxt7LS@Z%ydI!h(;D
zwn&UL*v=`9`I`MyCz)>8AP=*b(QGzNXNuOm+jY8WkY3$u{nRgJ#ZGL|wQsj}MG%g6
z4u)yb@noa`F|=ED)@8l3;fkLMbp$7WUar^1WU28Y=tizc_J0N%OCrH(>~{5|P5Ypx
zxWmB(d$+6gR@jRPHIjz<&Fcxx6U?s{BaV7VFwfol^P|YM+v#BO?Fc=SejlYUffbD%
zN)xhrQilY&f&NT`-@69ox_QjAst7MT=7F6e)3>>?X~UaMX~;o2^%v)Zm}1hO^)wj9
z+$=1X(|937<Uu4T;~Oy4I7uSB6~~BVm^bGNq{y*MP>`2Q&VyVvn*VkgZb(j6q<G1!
zF}@~mk*$r)`R*S;hfdL;<4)fwxP3+KqSmGOSN+UzWtdYHtlHZ0b(L|6=p+dfwlwFf
z`{TvSw_TGDZWp)ZdJ^H6Nfk4dK9iL`;o9U=P^~z}Jg*Ehrr$Cfn~%jEok|qXUxBfp
zXU(OmA_+1+M@rG`7r)~BB0@h>>~~vo`ObER-_}&v-rkm;EUQH<8%FomA#c^i03$Ls
zvB_{bx=uJ_q5{&{(NkdpfSX=tJ<s$>U{lntkoRl0-o(1ENllUVV7x_%+Bp&55BKid
zh0~RzI;W$wh}_Dz>BtdL1LOoQ7h7h|JIiAEUo{3Taxm`+we0uzeduWdst(5otnE^(
zKJxd4^qty=NTKYE*C<&`^I?e!dJuIp5~`|_>C=nOye98|H`U}8eaMvsk!5}8a)mN@
znt{vX15KLt!+&Zycy^lMJ%*KpqUaLgKk$4r)F-Y!g)25@gR65q?#w0g^@rtO-!s@w
zKe!NTNouPMEOP&_m%=DCG=aT#Q}v2mJHMdsa?vE%HGx-AzR`L!%Huoo`mDKBnJg1j
zxfLF&*xt8i@KDBQUgUaO-_~*BPNbrSNu#k1Xxxt)(!`?G6eks=i5QGqkl&hhndXz_
z6TI{DVu=9DEc1V1j-q_hpw!xxp&PjVZ~*>1op#OeV?N37)ZbjK54OM!BC6xp^)i>Y
zJgq+t0diHf1y&uE$<r=o;3VOkt@XrM19P1GeChh}K*p7Lq$h6Sz2)Ib7`6OS3y+~?
z>041flUcoR@5bWD%TiP5J?`-gT7q*MLX`N*$0^w}C+2zKhra=s8hiw6ILMeWLEe`l
z`6&N$dxLiVN%!R{5i!AsRP~Ua^?@mrpw)mEzI3Sx23QgOq<sv!cmZTQk9=#Mad`z!
zGKBNlMX$&r`!+&E?`jnH$(#&$6?G63>k_reo;W_^<#X66NPWyYOd^vJov6LaLMdK+
zULa_|gR8okaWTZWE)t!HdiJVpkA2o)_Up}Vf*Gqj)!8e$kIm9dRJtVC2XpR~iS@A>
zOF86j5q4L}^V}OEyF9r&N>gFN_Q!T|ZdUUp6~?y=vfi;}_h0_sNhcNdk^$$vK|m)#
zJU)!XO-~lqb_9>s>7_s1$fLAqY`DYK9+&5G25gu^H`8(8Dl#B~{I@+Ux7+9gF8tUi
z@4b&kUWCiA+u_SX4{rW|5y)SH^ZA^Ok(ZjP3p_hFr2n}f5T22`w~egIRFPqagtVD7
zkZ#q%LRq5QxxC+}OUCPiM8r?`hXo>-jbOhEKYsm?5PnXuvWh!$bW(&%Iy7c%o$6tB
zbyERX;&3q0I^Fiiu(T=M=kou5dPyzD_?^c<)N(mJsgzJpz!T8y9Vg7w=)BAToUarb
zh5w}${IfWsS1m+>hbKNVm1G@oCPBjzZ~ftK^teD|rtHez*!X%H-W`*Wap~)4(be%7
zdx~XSj3_3Q^NWRMOU?iN<!tinLVgHWE!{Wn``+B;_yQ)uPi~2vTO~Lq3o8S&xhC21
zAKB7TMm*=`_SWmhM-xI{Y?lMJiQLsFj=U;wYZm?=vfeQ~&vx4yZqUZI8{2kc+iqjq
zZ0w{#W2dog+qP}n@6~7Rwf9=z_xJvBALn&)&N0TEBhh&PKZF-4H!t_<c6_9N47)7)
z-zmzWwkZ}GvnQwQcS0$#PjL|Rkg=}MCX&Pc%99vV@voE3JI?3z1_wRd>mmSSvxgp@
z<qE{&|K#NWKqloW*xfrRzu(}x38;MW5%&!3Qv@VlcZ62$tAxpZD-H`#PP&3bOZ}{a
z)coyk4vSqZ%G=G=+*P#yKVdFf7`yD|)6(aXd`lUxwEW`iZI-N(428FVOT7gAWFtfC
zN+x@y=NlI(go)z~OVqHeJDgc+N;(`iFZq^p)hq%HjJmhsDjY#4-#UX6!Y!|R8aej|
zDv#;mDo77{ycW;f<>I2ya=v?C1wKVaoX>cT?1!cdu(^iHs{wb<GPhp>?~8A`=CWKr
z;Qk|g#+=iJipmC$VvfK~0vZ<l)T0!MqThpz{eg))@XhX$T3uY>cHq02<##q=#zyc2
z8H#zTjN618*R-V-SwI|dcd}Y6Od_BlEXuf*8q5CS?LR>Za1~*~2+sIt)&re|`+7h(
zVsJ$zK)wQxcu|0b;Jl`FbLOb8jb?49%P6o(l1W`)>L)eaYYB@KEA$gic~<$E4QtAz
zG`OV$MK4fj1{To_cYu=z*|bJk5=!}rDLH}!Y7$KXjR<ZSbk5MQ7FPu8878iEu`WWJ
zD#4rUp%}d<?4b&$<_rCQ<GG4hR{&trP<ShA9@L(n66h-{kWBIB^<zX7jzKviLrYvq
zy65@0NaKm?*v|y(qKRawI)*NCkP#ehq;Q4O#uYcG$(Jr#>dzi(af$6rHH-wZMUK|Q
z_eGxXYctQBhi1@#Qi1iUr}lJV>8aTB{0V|G93hRH$)Pb^MTrtP^d*1a4D-JS*v`Su
z<97c&q2x-$txs<wOhpgYkvZ%#ErjScKPOr-;9MB#&RuYWZD;j0?MvhX6*ZFAY5QVb
z*ehn(w+x??ncZP8sz1-+2%_0O8;$n>N?)GQ%Ic!d^WV(M@a|RnEeETiPHa*8{0uwi
zyR20kcWrHEz59^-gzjcvyvjPB*1hGzs#$;Yd6|quxRgzYC%4zn8N&-}`s($Ufpf(z
z`wff)O;)o?O4lqWb($z7drkwh!h^M{zMmp10iM>Y;`D%;Cv4@QbnB+kP{2;(fU-ne
zGnE$moG`}gBkq6|hGHE^l^8~)<aEtoR2*4nk4vDuogKVEfmr|?77u{dkv`7UUwss6
zdb!0&;wbxJwQ*;*k^X}a6>r_k^hBZmsG8vM(fS+7(ZxB2sUOX?yPcMHc2bsnxF&e}
z7(Gt=9~hr;aArxJ5HAquXmMq9GPZ=M0P7M6>zLH?w6N47%csukHr38t!Q?Q}C%5>^
z(e64S_w&m^16D)r58k`pP=7;~vXWf0ZFZB!nzK=XaZ>%zYGYI*e-Ami*Sp>`a<#;W
z0}8UVk5hj=8DJ@AtvikAaCxM(Wb$I_^IPhkYTMzdw8tl`mFocF6&Ha$dc;Q;Rs&OA
zm8f{wdvP`HgYN8)J#<s`L*j>(+^N#~DOZXyD*K|v@MV@HBFWY1c*!3pa`OaN1fJi{
z7j2NR@XaiUmbL7FxbH7N18@?6N)!m-bnN)We)mcmN;j%_33Wx|mU(Nna)Q5KI;pNy
zK5F<kTR=W)$!;Xgwxm}0J0&YG^Lh4#M?7GlM_QN|Uy{%c*gY?op?IaOB(1q7%v<Ys
z=k@Ib^_^SW{4m_;CV3ys^L8#Xjy|7r?gbr4kV%NRr=vxJn*IE()qnMj?4gbxU1GST
zCG&aQS--ohNXVnshAcsm|7B5)KpxPF0!8aUEnNjM2<ZFHPXu(nXp`@j+jX`pFWu9(
zuYQU#rE!5%QK5WrtaIMZC&dT=K-<XG!aL&G`0?KF=huB)@;f~<N)c5=2{v&gCYOLL
z2Ml9Cyp^MYgTLU@p^KLnZLHKVe`dZe_g1kjJ$@5k*@s^W={g>K+as|S_;_@x?=h5I
zoOvPAzcQg1Y~pp3^>_|ESKn*frjTcY%D^)C4~X;Em2Da1$C+E)mC!4;{Gg2bv0#o`
z#`%SO(%GOmHJ_(9V{_wk29uROUtPJGYTSadB|f)Gf2*56H4PpU8}7txBn&@%Ld97s
zYvz6eUVfvIornGIr}_i)GKpW@(Go)gkBddyYslSKg72&7>FAVHA<*yB+pdg^=`P)1
zg-&GY7{S9HqK$#d3ZbCa&0LTo=mwiLOM|t0FNLLwhRktp*ZrfF(|6}G6rnHnOQWBi
z02qBw`R_d~yUF1T0+MfN`5<fyso77~4nE>btHc!c6{-?<ktvKQK3>&(>WeoiB(69c
zMc-tEy+I{N7|2O}-2B@xiY@P3x*<WwVxPqJ-A-+wfhxuxn%|kGOLc@gcT_Qs-pl-C
z;l56+qID~VFNj}>@7E=~f97%a6A-8A4N}y}Nh%zm1QH!P`^>Q2mB_VrQ3#I@ij^%s
z+*--H61;RPB1s^kf8knpXdn@TqeholHMUyin;IxjRwxKj@o(0=tFqwqwo+sUOQ-z-
zrV8Xmr;|?-Ud>0Qh3$X_*Fw1h-P-{mf5;-PjQg}?zH{mmW=fSp9j}zez!xTWR?U|3
zq*m@xi+UiOSu5p>-zvKN+<r6AsHbtB{&P7H{V-yv%&K>U*$;nn)WJ_G;71+YDFCRT
zG}50%LrJa{Dr#2O`~d^Uv%|we?MI#+uV}Z_%(jsx4jan@<2gp_cly6_nc)z2`Sa6;
zCdvmzPVH(p2#2rZ4;8%T))wXYMJe~UG7<{Oflf0R4@#^jC3(CP0OU@~m2~yU6VVIf
z5T3Cv9ivdaUvHdpH@ESp)qo<uuu_>U;RtIk^Q@t?y?s6ls!h|m_S1Rfj=F-bU6P41
zgc=S;X8Gpw=m=G1y^Vqxog&5he{yjE*Vm(fy12Ftb|AOmwVsu+Y)Npyuh-u4a(&kQ
z1HTY=?ec{{<4MFD-U=_jg4IU7Lz$+Il{syGLqX{`0#{#T$A{hHFc{WYfV<O?fel*S
zp1J|=8R1%~uws}PvdKlk=<54PfUZvKiUue<5!CB(+^o*Q#psO_$|a}c8s<YhSl`P4
zG6n_#K7K!g9HOx5S+s>D42;y=Tpd!td!sNZ^G#oy2YqerEKuJN`ZY~Xb<TZobaVj@
zyD^>DqaLUd+LP*G!wZ#bu5?wO7-~E7afT^@`WG=n672d!VKtSc^6htx?t$sa0!$<x
zXTF8-g6<iyYBYj#GCZz_#562@?x!}<<`j4H2HCZfLo;rxH`k)16dX)FZS~=uSdq`9
z;qasC4CZ~|FBOh(CorrkYo2&Jg7ewn^A73{3J#tkr1}{@JzU?0bgqJ15DwGgVze~n
zmrc@4v|dzm6t5=u#OFXzFC_m$6u#ez|4I^u|Dg29qb4zkOl^LyG&PdFK<@Kj@}vum
z>y4Y0E;zDqk-0xxE@tdu_^S?t4I=`*=i~~=#=KOb$y0XwR)94%K<U}pCV`A0MS=Au
zAo$bch6B9vfy=&O3yAwa)PK6lrD2jKZj^|(N`U`B1%cUv#_C6-??jfal79kKqEYT=
zRc`mViWZR<q1<B&(f1ILpLC>43<(cU7=b~{PHPNC_+WFh2izkyMMa|uObVP%j4A6&
ziEZqeXlcg=3lB`T@qBLe7yEaAWuzxUxWApEtD^i(8{}I9vHiX$Vih0z?G5O&(XQ5*
zjhCk*sCKx;Fp__rpPoN`dYe5u5n6U>xM#VoJDH->?^|Sspc})ibzKV-VIN=R&2W8|
zAI_&9_(+;|5$Ha0Bd^bB_ImG{?x_*SAKF5m@~#4efy5>VAn7q_F(t(%G;lgDw~0z%
zf;`-f%dMV|TxM&boGnDym`BsfXCKb6VJ@Wq5nsPv2ZW*sCM%8j_7HBke6yutrmiq>
z?H(U$$158{GFwX_dhY?e_&frfZEu3M--HU&TyLyb8=YPsjew#%@3%BZ$67W^j@gEa
zxD4GMpwtYoD7`(N+ndaG7)wb`<9iEMrn<SiGoAc~z~O9ueDd}p7uts`H!*Ittp4`?
z!kLm_Vsa4HrzkeyOh>n4iwkr&7Oc>Xg6<-dOK+B*i72s{2!u%l?3OlK8WO#})J286
zT<2Bl!3W=;A|v$SggtC@F(_-3VHxoE)z(Q!wHT8(WIExmmQ4V50IXf7gi>;6EgUv-
zh8~Iao*)esN8%SqbV6hgGG4KKU9?n=OCbO>*WsSTMlt$wxr{Q#ofMsXLcsoG?Iof$
zX$>g^_qRk*(vPahQCQGiPt5xkWAMheR-_STJ(CI^Rym@C;sq3U90Zp;hs)GeHKJIi
zI^MD`|Gl#p8yhb}Q9K%IvfA0-nZt7Qp`4C&!_u3w6PhLQS~cE;Hthzz)bQakIcz7+
z*M8G~?t8IN7HoBskRTt3*5fef4++Qkkd_*emW?`Fav%~w3`U^0+ThEQ!YJ7rAqOh9
z#H6IG7Ax_Au5V-#+x?MNSXfKxTyO+jwGd(Y-#TnI)){R#XKPGs;IX!;P>a8S-=5A{
z<VcfgRq2N{H24d)f@p)soUc51NRQGB2)y-2U`2fWN8BArAMT=kofrD8wXL}q;h!AZ
zrt(L>onw+!(l?ntA4#7YHksQ<CsLxTqc^qta-N*>A~G}ybb{ofypp8?iSHE()*wm{
z+AnAK;0qcSbTpFhdkr@b^l~8tbBoiB5+kBaP;E4>GUF?)oGs)4YBO@kDU_I8%Yu!?
zD%>r*EcgVvo)ORkNCC@vTM%C*hbVkfCp(k*?$_}gm~WI>m3<8F$3O7@o5KPsBH5fR
z<$p*NF|{2%fDGG92udmdO<{3F>D2Zd3glAROeeGcvRQAeRLC~*`DWZLKkDlfsMW}T
zGIv(aC!mE}%ghY(*_jv?_VF}}!{H<{Z`-rg@*^z%DW9ADA`m;PYr;3$5i?RAttK=3
zXg5^@y)Js<$mxAvUG>z}S7!<;CNkxO5s~~%n_3Wx^z@h+83pJjK3p}nY}Q*H?4YRV
z=psWPA>r?zI9%H3KJ~qT{k<mu4oWW2<NH}?8<YfoyR0WGKCOYKvn5-X*L*Ep7cIAD
zfV;5M(G;P+%C7=nJt35}_1v|Ga%t5W)bVBH`|a}*H)7RAXB(sGQ3m#wNsGpt{G>Ik
zifVMc6!>1PafXt7n<nq=dw_X&E}yrLhx64{l*c~nkk*kJE0(%RHMvMc58aLu;cdSo
zs-9SN^T=1PXRqi(8qfYu+DFKoN4~W~RAf8S#K@?@>SUqOPDpBm41u6MA`IHm9+a4B
zb--lxe61s$uMO}?(;yqMUZDFvn#7kY2C=`NZ)U;mw+A?mxLt0FJe=d%thr+%Zn8R`
zg8O`IJw1IM8d^alArJ_#wtna0y8G!$EZ#M6%n=sc33S~{Kbbb#&GfNus5Z|wRoD3x
z_5cu4f0@W!im<rH<hDdQ61|-(H{Ku@a?aksKNfsjUbS~D^aL$eo6m8;dlX!YU65B&
zR7^K%Q@Y|}i7@tT&I59>Jhakb%}<Rc)pB=Ycro`rU6uNI>GOFz{I?oA*a_F)P27A#
zqs<Z*3g_EQ&f8D|mA;b?BqRlVZU(ERKRUk3S$UNK!p<nm*Rln1c3vT?=6hL=;Q!<y
z;fk&R1_v9ccqaQT$>jvCtguzC&bTx?D;{$m9ucbH^Y#qp_4ZsKxAC|WO?!82rmls6
z#n)<LYdbg;7reIWm&v#GF+xAIQjfP%O*%YYjut&Y3|W^gMBe6;>2s={(_I@#NB4er
zTy(LH7yZ@J&z-?sPL|Kc$Oxz-emtCV=(D=9Vq+@<kgqF1pW(xsju{3@yX$A20{4Az
za4!N-+xHpii(Y|}+ios;rA8fM>@wO4(if?JkxtE57uvrRhwb?W5+p!DDPj+W+J}lV
zq5&x%PtwVCM2<ym2fA~~!6E{<8dch*<k;bY)%h#dEj9x(hu{Cliu}SxK)t9?0Ccj^
z_;0if4Ea`CFji;o`T0RQEt=e%kxlMS;bm6g#$(S($*f$S9OKLGmCBNESljj1YXUAV
zEvy#C9iC6s9uJIMT$;36Y@c&)00($jd>&R_-jnS<FG$6ZuYMsRpQRE=$VjB2;qglg
zv7<vm)|zhAU|@1r8Zv=iK5iokrm#JLJeOi!91s29Xb9X@IU<}@oo-~1d!+qC>TE-)
zu=1U1w$pwJO)%Pp0lP7ru3}B^%6{jK))J9zeKSt+`}+S!EPaDTx(1(Lcp1us<YbYR
z%!+C=C<e@O<gf_q0iWCTX?I{MC`boj$g~LtnCRBZrk65A0FA_2eLezf9JsZ#I>ysi
zSZu7P3u7{J4Y=tNA%wYzcg~(}8epKq@Nkzk)z$gDU%!#WYHBqo&ju#M$CHuknJ>?F
z&d)0<uvXGMZ})Ngyg!W&iciedfkl%)CMA^q`#$*>+Tp!j_t@RKeX$NTM0l>}q1y3-
z7tf0}rTnd^<DJ0PEsdJorVqS#f#xd1hx){*3e{7;p>P1kqD<+(cnVEbjcAmjJ6C|G
z;9HpalZOs`Cvg0HBBQsxI*{F3r=pu$y5zQZ?m5&&6#DAxg$5!|*!6oEL^fHD4dU$u
z6(o}nQ=cC#o<>!qs{&MCmHem@D6;v_|3rPuFX*tIg$!dyz34dnZ)76%EkxD3Fqh!{
zqvL;q6+ooZ5TZXV^#?piF=AC*+}`fhqyh3;4<zhi;nOxA5)xTln=s<hgX~wMOxm+q
zv&qLRFreKD+BV!1e8|Ja%<+N?pker5lmM6C<`E&8=)K~b#xFS(+n*=6J&bH)pL7nU
z?SVqKr?o_h6S;Yot8##+{WlN*=j#c8czOnXVQ7Segy!Z{sC}LrzsoVRa`W<t7^Kw{
zwwO0k58I=9_6_@g#UeAJ0^*ndOt!nb%}WzhtTjb_C&1|vd%jo+bt2p|(Noa~U^RUc
z>deP2kwS34d0YhqKyoJ?z$Mr4e{tHMejqEbpwY_!oF0MRcL<n9YI}m@kZC`jFk0mI
z&)-jXKKICrd}C!BbW4BU?XoMatR!K)!u)3lf7$vRNVcdQjz-O&wEp2in~0mA8!_!b
zAuzB=bD@wSM!NbwXQ7rL_MKVSE)j*nVWucpk6Qa?vxtzFef%chbbI6;Cb3NWY^xF`
z5}V8yzP#9QoxDjamejnv#>!gkcK3hUV(~}_U_F4iIg^JoB1e}pqpN(JT1NvTZ*WaV
zmp<yNVi*+>B&fhFA0Mu+*jE^6T{CkofD;3>7e>7}<W+#kpAtcl6F0}r!$FwN$MwyO
z8AmBQ-~72IA~r|yhD`?0F3p!#B)k(Tk;SlDnDigmF#rlhYB&3rKxs*hdS;~=MXv{@
zaO0%dSD?wua%bf`O-Eky7fpY`Hd^i^f2*Hve|2PANxy*~UJGEq?*h6PU@!j@vjq_`
z1&x8I<H@ZHef^aQen=&u9W~5|`hxMdw&7oI-(F4%ao9K0;Vh`xflc$<0r}m+ZPxdp
ziamHed>R=nO1KwH3%JguS~C|32{Z9Q3UCEc%YMKpw!3FXF0C;lXRuU**x&DqSHzww
z2B2x+|C|0o_oeP4(i7y-LTi>;WTmHPl&yWEW^vw4Dv!}|cM8$QL)~WbFX`5?Hw{Xh
zs{GQ=g?%??o!(?}V8(2u6CzEM`melfFP1bg59UEmkHNj~=B3E}%W~)9|35ABBT>x!
zLyO(AJfE%786j|rLh7{JtPBhTP+gYuxIi1_F@e7{yZKx}m&niVeYA@^B^1~5L*qj%
zwtfiSP;lz!$rd7goO81QAmhL8cFrp@;hbrYhm4}aORajA&#AnwmpN`X{h0k@6BD!)
z6l{!)(sDip6?z{_H4FMxr7}3-*SFsv&Uq8X|IlhCT|ZGZH8p+L>+1u2#EIlvWm3D2
zkGt`>^<iP-!8F#x(6oR&p#HnSV3E=SF;T)sQ_V<sj#>H;#h3qyoc=q*s;615VRT;@
zhbUQZb>$}ix?bJ(bi++wU=4Y`xYj>2xlk_F5`t#%Hz}~VjCI1m*x+&?cN;nSlbBkb
z)(Al0>%~eA(c~HJUgurY#5Lk{Lt9Ayn_*>RiH;8{^s(GF^%&em-ht+R5xN2@K3<sr
z@@p{>vPguruO<F=nsg>Mp+4Tp*(6lt>KY%{XUq(<F+)=mE%vB;Rf!jg;e$w`k|S8V
zHHzOYI3Wf)=J}e^&>1&{wrtR<^lEc4ukJ384PkF4#FiOQ_dHA7?4dqKq1<cS*b2*5
zBMf*pp%NG5jUL!h>_QHU>~Tc-?A3&WK%_mt9{S4h1@-WK+}FMQwr6}%UOgxX{kBJP
zKlf=OE8v!Z90L~&K1ly?AsSysDKnF--b=fFR84XSi}CR>!N>Ww(@anT_AN!w4>Z;X
z{6v98E-G8^C3kmINBCpsEQ~N>TjzE!tkW)T@W_Xy!Y@~hxHdxc!|k>)%%rGY8;K~<
ztIDk~4dUHrZ+93?4B<)a26bU`A!R^5%tgWN<`2uXC}3R-K)9@bpAMjr4`R2Y-iC&`
z`D<xutMi#A(Db@qsQ_pm%+=p3eR-EIJu{lZs&sY=DkFW{g>Sv;y4Yazz-o~%MM9*v
zfyK%Bc{s~rAgb(e&Z2u~pePdi=-M?cD+}cPy(TP7GB-z5CX-LmP73$8WFEM{_oU~Q
z2KtCd58amfI;)k0%s&JrfE?ic)4xbc%%HFl5`)tM7Avdi<?^S{GZXDEPG`*J!Nn=V
zrG()Vr!BZO+|{`jIPE_-799UPKqeB#z2x&rWNxCPVY7pl0=&{b%D$KBTBtFbvBAO1
zfQ<SFA2GeL3pkOOks_6`x+O8Qe<uaOmk9U0)61y*y&F(~D?b#vqSJ%_*XD`kWUtYU
zpBy{3JEGSSV<D~>Ak@Yw0u^{xX81Y3K1k`I{Pl;?76F(W)YzhFiMrVp-*2t2#6-!j
zmc~OKNP!<hp}z4Ta?aBj^B9|%L8c&iOd_+%?0v8&Fqw&<1W@jv(|Os&`oph!pSat8
zYdN#4jlRWJN>TR9%H<{Z^QVXn$+b}q#VLNUIA1ozQ5+i==67`+b|kv%(6pp!fTl#~
z>zQU`Q!Ul`Qbl>s^X$6UA1bs52zh1`DMx9V-DocCNPKwX4jYb!-o`S8^|#OR*gikw
zl$89!ypd4pG!vj*a1aig#0JLV6O#RNrp$M9CU<~Obs#ITT1pQ?>ccH%ksevTHWz+F
z(4YF{j~hqWzEt7Hdc62j1=*wzYAg(7sfSQD_JPe9n7hwG+vCK8KI}(#3nXN574|?~
zc5%r2NL!c3&sb)YjRd1mqmNhfPp#pdU3Cj+EQ))cR1H~f8b-H)rTSH;Fk@hT#eJl>
z*wEjIlRMJ2Pk*!@8rLU1D&BOq%u4%Pl=OUzwsyVjem7T$xg9BhvRp08o|F%KylsO>
zNz@?Yj#CgN9>rMSjnGaE_gCZreXDib#BNPia%R^Xed@-Lwzoy5Wo?>YZoM{uSSJjk
z;y@5m0B%Yypscz=A;z_4qtwIw{coEwQ_UkV*XgzD2hsdZ@c1Gk_;}KgpI{n(9JYRq
zw$xNqVbJHkC=cs6c3(p%sKTn1@7E`jZ>6-hp0Z8vC|if$_|{F_n^yD8;#MmXUp8PZ
zkAODl!SGg4Fe?e^URzf?iL$aa<Z{j{4Hw=15FVpr9ynN$SR9iqWMtdRbyADP>^8C+
z9hud}GL}DnytcR``+t#fzdt<$x_bj2)RB7!_CL(b>b82?jEqR-(n5T*g6NBT-h0Vq
zQXa02oE4`*aHA&8g51Ea-eD0h(A(+PTQeW7Dm#{N?IemP1QEbccvY-OzocXN++PjR
zCjK$((~y?k?IW$0WU_8X(IH=X35=9!zw@kLn@)dN(BZSC-qPMa9gB1FFh9PV?zgv2
zX{u?hG@lD~Q%3<eA|82TJGwhXZtM=1Fo7-x-G=LZ3I2429*bu~l~0Z;C>_tv-??Dh
zhz&g}O%Dd0?KGe{>F=jQnK#Mal7n@a<6CU;TvRVfXQ8i=J~2dV?bnVhyRISKy+M#?
zkV`dXKDPGizT&`{bm*B|0KbJ@EM*um<==jspAjS9_$I+9{G^IO1QuF*!L*=WrDAV}
zvU2FCYR37t(jwMx5^-WqA!Sv!Eo@wO5Glt~_B4O4VJ_9*a_H%G+iJ7dmzN+|YvbhZ
zU^fC$Sk7S4=8**D67C^pv-@r1bVoR=m6DgWSE4Mop_K}uDgEP2hGX)p*U@9la#?9?
zm#DV2ck^3&epy&O>`c;{R<_FZ3AFpgjE4M7gEK;)S?nK^^X4@}in4hcZ_U>GA5#q-
zB9`a!Ldo3YTxGE*NrRo?ob)$1?QI^~kMTA0ro73jLr{TtLgk*qIEx#LVLfxo`Fyrh
zXG9X#b!4v^M@*-uraOmENrMFVIUruZVnHwuQ!PKn_FM+WEdOmFTNX-hj%!Z`qwjMf
zHNK5)8p4lVL@Q_Uwl(e=-@#jjO5_Fn9%zzGX?a%h<t^UC=QE<s&q1)>7FutT^+&+D
zs@==W(rPR{-Kx9iAp)D#@M`;LF|5kKA9&)uwuO5atIBa@%aW5fYR%5$$UZA8dn(IU
zw)yxxl<Szlz>Ek9wXxao<>%!&UvAbzq2wnfs_C`4gZr5iijQ`6$?57o55}0jOM&oA
zY*cQ215BPA4%m{C?|i=+ixFfygM~bp{B&)&>=7F)Vi`_OSCj42#@b>JpprGJ;t+jn
z8LQVquQw5`@LuyjON4W*OT%CulUUOd(sEM%^;Tr1)=S!(fCoP=JrbEzyQbe*rOF;T
zJdJJQv$8lx9%e7Kkf|3D%I=rL-2+dyA)8v?zo73*5JGT<yOlN4wqS8sB%`wf;a39n
z7#-g5GA2h)b+o86Au95^iFoH+h!oQBs}k}2MRR`{V^2T8hGk<3QcWkVvNf9MsHCpI
zzPRFm)zi%i_2(M*iF!z4dwh6;a*0YEjp513aGEAM(nqAM0yKWcjFhZg%IgO{fG75?
z-Zmo#wj+It3bcnK4TX83@I+$>^I&SZ?=IkC>}F3ZiiwnEiLY0&v3gvz7R&#A!1}#q
z!JXRj!PIqiyB(T}MZZLa(n#h7D3r81%?QOc(nQc;YQH<j^HlSy$r4Blv2nUXzRhqq
zmEd$VYxDRGE9+{m0SEJD9Lxi;XI-xtlZU8T7Divnf7>4P^J6|ZJjZLVh}mPX?ik#d
z@UT6ixWMSv#wt(+T;`C9RlIEO+ztkt<78$SXOZGvOXuVomFIq|IHXQ>V1(dA)~Aqk
ztG&v@5new}JatOyy~L5_*~v&0?k05rq-9Y0f^J1EX8Cv_pcK15ie077^7h2rp3Tn`
zP^{ekN-RLuL~|kTI9GSZ3w>~UG^a+bQ$fM0p!feM?(7>P4RnMDd3TO}DAl!Z+tYmW
z1T&gvVCyK}<rV(q^Y#k4@GXp^B(asG=x*WKQFbg(?rzt{t8uZ+At3)xYCTEB3GTMe
zVkcjWk`VD`3P)i>djxXi<;4j2E8eFo>u`cl_VW7r@!44%fkr;QsP%Q(=jVRJ*~>?}
z=?E;~g|v9NW#n<OhVf&Nmhmzf$3;OzOp3b^l$*F4iu3olPvdsGR{fq;Qc#edxA*N-
za{Gikx4L@&c&-Og8B74VC+Nx4l%lWS+CR(QTW8TJ3>~kHpM`~rk#T<15)oYGhCtwg
z<kP?IXm5)3Bm|U@#}#q{p4K$!{8$~(AQbU{00+5NjK&elk?)~48Lg?+^9wBtj#wgC
zc9g}{3D=9s#D8}4%(i!XPbSI<LQNPKg?OG<8Z~*FW3^ROcXqi^Tc9`X9{mtR26E}J
zZ>Q`sk9$q^^-X{SVM8GGOYv82^$9Uiq~AtRNLa(((`h^t7qr0r@%BP`?Mk-j*Y!`I
zK3#rX>QlT*%R5XVU9^0Lt2e3s3JsFukP*espp@fuix2Wn4NN2qVZUpfTKTH;?P1;C
z2-uLJq>fHoUENTYfD$pW2^{A4*^-6EzHl%MnoDNJtJygfy&B`^*C(y;FsckbUe_1r
zu!e`&NqKq$gPw?p)ukF<FV6%t47JrZkBE&v+O0D@z)|ETSF8sDo&}8umc?Yee=Ie+
zsTm%dp%c?$&%<r0q?pQAY|PhHC|$jkS!zAat*s(IzrKuz0X|=TNc(ggzH3pVN2Ey@
zI%aC{8&bS5;TDuDmG5ck3A=GZ&4ML6`mph|24pZhB0`)5F8n}vT9Ew`4Sc9{FAKib
zNXt$SqObT@WGIR-CM=q+Kwpf$V9D;{@61a%(^QW-zsz=n6Js$VqfANH`{wHIw;;^1
zu?hs+XqwF(Z>{V5p9N(lKjZBQIv?vN;{9|5`}FA>H<F&ZPeoQtR8Eciu!SUIV0iXB
zRt(Ywgh(VChW($y08OrgCsQw!&&M6~D3>mufGaux(Pmv(pW7Li+xavoAYd^^1jLYu
zk?venxYrR|=KlGP+hlxoslleGsR<!QoGi``=;Sq$C^tWS?5D#8*wM6EEF?ca2mSmR
z0|Md=0gp>ZUy;G@Lj^un#l*~1Z9E#%%i^&=dgW)<Ghfd8EA2H*oP-IJhU#}M&_JzR
zr;b~ufF1E-0$A&=sr9R?@Zg$x8B6o54~mM-{P#$uBr<_;s5KkQnVI2i`tYe^78^5Y
zTc6_5@h`sOgVHayKOXK>w{>*nTr?fNw$xs$52~-M>u|fXYk~FR0K1GUfK*Ouo+MFF
zdN;eF>yuGc<}!K+`?p<??KSjn8S<dIjv`Nxepa~+ne7o~;}OHt=&^DZ_Dn=vp7Du^
zaQ+vJ7EV;o&(AzBpGPO;%ybdI?I!x~8L}1tO^!h;yHmL$orJ!%YJJ*X^8*Zfe^aY*
zxzm68n3QZ~1g+2Hb7i0NG2M8Z5p%g+ulo|h4vfbU(n3*6t)0Svd5Q$VD?HN^9OvJ|
zNpb~RU3PDIZo}L>5FA^Vyuj2D=l`BdN@gC2Bp~Q)Fe*pXsw+i{92v<BR@ZtF&~#@&
zwE4PDyNJ->qjmYb8At1+a(AZ19C0A#B*<}yVL}1B^Bp-kgnW~bF0s$hVNsl%(?<ez
zIxe-S<gLMzcD-Sm>8&B1uEa|Fn9(q-EiX2HQI1#fQ4C}o4j3sOs9bC2O`it~qRqDh
zun-q5z|MLigLh?SCaP!abvYFNh8SP3?RjugUP@0duIGryYqkRg7k*(;d;p&KUju`y
z@zIO-ew`by_baLUnna>(ThpJwPk;HbGto`^_~CQy*@pIEbko66m{XV{VN!TV(eR2J
zyfcP3D$|~P|1QFozX7I0Ti0U7kgA?D#rNoP=~tp7qrt{^ZE@AVX!j;D=sU7_HKn9-
zSe}Ep&$y_rb(eVjGzLV7pBq2Y^~^t=K|OAMSOX`6i}hRU3K18V@%^brbF(=v{PTzj
z*Su$eym*p*%~D#Fw@?oByt$ZywXQDIKVyGoDCX?tf3X$j3GVBE@AFLR`~G7+k`_k)
zPc}usz)b{}SZ6MzpVz>5o`~z`DncHlVEaZ2zgRO8ws3!=O(JdSCM5W<wLA9F#iG)p
zP38%(k%_+g?%{8fRxt3Sx*p3W@)gHn%iU?)ZY%jX2#4!f<InXuDKOtFmwt%Z;I@CX
zh7R?&H$WIiWH9vB=DH+9AYOdXJiwexc$E@&+lt$KcCy><phuikh@YD|Nur^}f?DU%
zV$hZ{NtkCuB`um(MkZUJVN;Y}5hp{QCFyIwKP|mpF0r`|#wm7EOL$UUTD|@vikDtS
zC6kN%kz5eU?u7wOs2$&#Ds<W~!rf3=AIhGc9*+rv;Ena@^zRAOzuBj~IS8&?3KZlq
ze_tVl##j0v5@AS6d+Fo290vnhNvmmle*M?{ski<JktSQ`PJP_ncR-F|=-yCO5c1v_
z-pMtII5eKW>yK$M`iyoylO}7^k&~Yr>wbaO7Q^VA?~r(&f=Wfnx^-9Q-+&ar-D~i(
z&vwT`?EHtHPP~raQq$4oI*y-njF~XGo>u15`J$>Y)74#E#1nB3ySBQy(FR5d$Ebmf
zuIi85vpU!IWS86Q=NeDFR(4(KB$r2QD7n+TJi}QLqh^02g7o_&mcI$b`$RQinP*q7
zAY`=1e))nLl5`hw_u~L5uJlkuCZqnZ3}}e0agG<)hqFYjHi+@+qrkGY#7FDnq5<DG
zSm#Kr%e(p#>o#?7F1yfFqeh`j)wc~e8g!`O2hQhBP0{rcH*2nIz0u9FJ$#8FZyjws
zV+#^u(f2i*5R2A#9(V}7O!EQ~koM#bc;KiNd;yLcB!;%kr>IPUUk4*qOSD@vjT_ig
za5o>5A97ut1%FzLOG~R72tWlQ_EmNd4~1B;Nf~TcF+$V95w*{lf$Qio+f}KLvY<h)
zt?{A16z;v~L2rZm#Z~OJ`3weO*9e_<nb~3KQ5;8Y<kul(y`q7#ld|W2i_WUgiO_R_
zmmC}VenM$F>KW@ode8o;mL1LW{4w!~9Di%lDHfNcqH0cqN<&>-y(Fc$M_@Cy)`KDQ
zl78&$VN(?rrG}PG*>)uoa&nUUh0)@n(+e_t@e$PxgvZC2r&#L*c}H^fi8fez)BK9}
zw!9J)RbH;kJ;6`c50g-L4gBXuNV(p;h-u~+h)5Nr{*T2s|A!6S44%~i<qvPwn2~qx
zJG8nRZ5p=5qy@$#p?V!T`f}Nn9sN==wYfpV+z^gr3eo4+h@O_|l-q48C7I8(8c^UO
zRj^4{s;rFjtI{P#4$kiC$u`Y@C}i7s$3XUS3_(Fw&@J@5ime@^+mz&I936^v<Ww|x
zUVZx4d-d6wkmv%!;>205`QLU~WoXdDzBAFw5ZBetW4k4DGW4Pq`~puuUVOh@b6u}$
z*=OA7QisM7Fh8;^b(WwC75w_zYj<?}>FSH4Igow?+HOZaA%u1n5+aAXwsL?oYS555
zOJE3d9N2PzgC=)G<gwUsSb_>bw`APVpBTay#PkSIpi+8Iuw`V1SalI08d_QQQ&Na$
za@HhoERBc9XT;x*4pVh5EF4V|(o~%ntcM(e^}hVG{JyR|h{=L!-?;{ct0p4gP}E#S
zjPv%+rPUF3HY=x)o{~T33q|BtJg7}6+Rs2aCUbapZ!_8eC<?4j?Q^dmL}FZx-d;cH
z*=n`vLtq=b%c_=({z%j7>Gt4_R1?GKk1Dk%t9cgr^jc4zxS{@WSv$(A-`=Uk)W`Mf
zkQEih<gt(h8+&q9x|PIyf(<4ZY$x<S%dgzqXEk?*+TGztvZ{<IV$MJS^CjfL2R*};
z932J9mI=`Y$>gZcq=cS~Al33>NzG+6+vA}otmH7Fs-Me_&p-N);S+u{yM_2gUQc9?
zO&*?G!d&O(?OePbd-tfQ+-Hl#<%>pz<-tZ4yQBzAxADM&#C6Z^tL2iyj5OuJP5l1i
z>o)eZx_Kot!^#YADjfYR{1Y>sxk2pKH%;ESz-mrcqO0TGE$5&Q9=)5KEc@}kKeMg&
zWPbNEitFb;=5#djefy$h#iQLW>;L4)$|$%8`b%Wx5hh?!Gd~JW*R(Q00(WakwS%+|
zt;SH}Ebc|_RSE7h9iDZpmPW(jaLtGfALC+(5PM|VxoWe0ES^`ZNYI?;)jmVapD7(*
z;EBQmo+!aafL5!lv_c@rM}lk&MbQzB!eud^Z3Qf`O-u&LfU{<TQLc-};&y$k+jhIX
zqeM+@aWHmscil}B(_*o3g+gwvzink<QKc_(dAhU(*k!=M!D0rzzKK0Q(E?eAcI}`~
zU)a~KQA2j?-e`P0Huu|Ki;FVfT0eWD+nAitq5uQAdgr(0NLn9+t=0(X>3Qy7j;)Th
zR>Eb4Rc?fAHa&fw^{CjtNV7iR{3V#%cp`_5#j)y<;UW86pm8sSVVUauFq#Ww4K$WA
zz~i~UD+<<Z1G%iLtB&XGNeA0y$%cInw1F%{s$4}eB38%wNhQmPYsa-EI9<<O5{inP
zEZFRH>aBd4(H2dNHjDNZ)dJl9z0<Vk1KKLo#mH+Xqz=MCsyHe*0%uyQBiE&db^oAR
zY4~u_$tx*nu?&gpOdg0m2!Vz>3y5|HdpSzc<1#KR^07x9d($i@)(g{o_tWt!7p#lk
z19fU5Dia4$nl2A}QDRuBVj3wHLmqe%zlh?_Cu~bi^))QHYzk={m<PRwew%RPx2`d-
zYfAXB*<aB5gqODrz%SlW5Fd^j@VX6sk^+;CrEh(11pISJM#^vK_}1+C*Lu`FBz=Ua
zX+^M)GBH7R;McgH4$r12@0F3gkBfv~tj&YlzF2uYI`7B47`))PCLagtxFx8lzI;!p
zsgHQP)LBU7NNY2I_=BeUIn%SXbKEdJ{Hgk6Vt&KR>Bn{@;d62B{f6wbdVAlRVwyB8
z&pl$v$@HHet#*4p7kTgO)l7AcBG5MY7rk~mW%2LFZmp!0k6c{Wp0J2Eo&5hKR=ufA
zF4nzwCvv)NT<sPMONE7vwY58`>;&PVoK+qVL<E^eQ&SG)vbbiZhf{gs%U3%kY}OIs
zAsq7Bo;y!B4+i}-;<9|&Y6}d;qum8^2nP4wC#v?Ar;5;lET-~$Qc^yQjK@ngG1Jq?
zV=7Wy9FIT^GXZBcT1dmE4A+Ymjuod(bf1H$^&V%?5EmGJ&)GC-VhC;;A<n^AFhYI(
z_~T)Btk0pegfB?&1ucOBtNjkC6=f{d$LBdm?3S;cW)|dW5lyJPCVHzSXA9B&dSPu%
zSJO;xXiX{Cni?WUc;WB~Xv%?^Nx@Id-2&Zj>{3HgGI}S$$(-;(p(NcewP|j2TXAY`
z4=zk13g?oU6rY`%j35N7XZSsbDGm=zWxs9<|Eap1sr{RK-Q90hu}0p`IJAx4v2p>~
z1<G~fp$;6_i~KXy^n4O^gHYtR)<E!(2fd!XTb@K(4420h^jVf(GLv;^`!q)$0WDo~
zbN(Kr7MpRrWeaVBwXT(lE^Sjw0v#->D|K^OZyBXw+{9wC{hx(W%n%PD&KLZqD|2CA
zM4ts8uHBoRAEDHU&vJGB{bc#q;~vk~Va0HCe(OYU$nvK)_RX2^cj9Xiv_kPcT}}oN
zwj1kYVkqn1ZU4ai9fFgU%XA<(ea~$s<ZdBzv@oVEq${PG6E<GCVwYR%n}K<nLr3dO
zUUO-=9GnHRR%_IUq31;>LCEE(Bu87C<h4KKcyfNCL&JlxDnnh~J=$8C-$k}x+@2F3
z9E0wxvypk5@u?@;Pp5e>#O0O{1MVDsc4OSeQgE_K^g0E(H}knp(6ATBBd)%nhVXBC
zAKMB?^Q{&78}Lr8Y>RD2mXws7tgSdq|51R!c)q(J$jbWa@^sB+8JN2h4}mb9K%2_q
z<Neida%|MoWE2xXN#r%21a!T{1-moeUg+l+*zvgQ-Ma&RA02TX9jSqV?*IO+qSJ&7
zA#jH%?BsZ>BL9oik>ZPAaY{*Cb;Cff7T{uI`gI*NhmIe1%>@b|qb_WWzs80(+3=M^
zg#ej4a`pDUGntNuSw%{vn~9*X8p@gozJ&!f&ONEM;|z@9j7G{Di@mPbB@E6EvZ<Jj
zet_KK>$ygKY>*%RyrkAyo6A$xYiT#!L8AT`*k~>NX;YQQ(v*VIssH?ARgxu5%un)F
z-WOA`MFAdsIh#K~=#PYrLQ+6e<FORuz+I8<*&-yZh%2(f`OScPD#0~Td4ao>PhMlg
z)x12>XhHUN%o677x*%xpO9kN(U_qXugs-}`T@&7)a`@f)wDQ21I6TmQ({CA&qsB~J
zQ}^SwFNkOg$^`WbG%52$i4s}k8U6mH!{YrANl_s{=<gns=2Lz4BPhB^SJ$b>KP`47
zE_eyc{C30s$iPX6Xv4`Q-J|+psD<M~z*=<KcK~HOV{bKOe^N}RePe#4VtrS@HBA+x
z?=;Wg5Z7>MZkBkWx|r&u43<^)1))pGwqmemKSlBY%u*u2$GYexvo*T$Mo~3_M2OfZ
zk$pEoB@+_UxV&hR|G7Fl=eF%ADk;<BwCP7q84<Z`_i_*iZYdopI%BuC!Ztr@|F(^Y
zVQ=b?fYc!_D7<m5HY;{U$biegVWPiyTRY(3yt*1aCi(@Q1)pbec3~k|9F>}q5|i6m
z!QTG4I}k;+G-+&WHZ}R_^XDeHj8i@NUq#tlx0hS(Yb_JHjfni7uKdh3_$cID)D)P{
z&tCwqYyf7)^fUh|-aS4pr=gKRzdkoMCvn}U1s&odMTKYs0_@^p|AC!j3vB3WDM*sE
zlqhWQ>^c=VXmfjEqU`eg?CgAn4RMdsu6Tj-x6vSMi6JTdlJAgYh)|JtG9m<$$&UHW
z<zatHg}qdGol~SnZh|kn5W7X89m4tydr?((5d#Wl7{1__(-hOaD+}h!v%o4B>MO@b
zvaqpC;b2xvR$p$sJQDu%;?a_Ph_*IqqXyY-nr%dF;yBoJP!zI+H&aNE#{PI5cO(}g
zl_=nCK;e&{XI2jnN-ecxYKu|WtbHIeUck|z3A0G;F3%sFq5y)H(T-Qgfj9O<|I^^(
zk}v5JG-Wo(<S2}A_l$E5^9t|kOANLvXeyM)I~sHh@YuLsv?V$F_bd=_W_ATS?0l+o
z8!7vySrkt6EsY6X{p~0z)%rB(_RKZ5X`yuWbG<0h6K#W>yUdGC1;!I0H9CfU7t$_<
z#H*b<2(}FD`T>DA?ON)Qp<UxCiu(8vJ1F*Hfb-gMGHyvB49FYoLo|--)?Tnei%mJc
zQIl!X)QDekU4!23oU%4UEyf4tPx_5aV5i`GG6fEb)fMqoKV1cf$$^HG6CAs>rI5ny
zSSp}Yti!`INlkGcQ1+ZHbPElIC@qbMh#cVL)(Q@$*J^R@na*1o89wjtTT`zw_7UvX
z6%sltkeeAB^LTuup`@>9wj(arX!Sx0@Bpls8LJYEga4G3Mf>^9UhS;>bcOk}P5UY2
z!KHfZl0&+qDjdh}GADe;mLq^zeRGJ3Y_5Fyy*%#J3nR+9&`J<V*;Z~~e*5vJ_B#JT
zRNj($y>(;ofX*Vr2<38-awC@LlT>eU&$6338ifx!BPKNj1aVqNO@-E{|5<du!hLX}
z^CS8@R|A<S3<4=cB&)RWO=|-W8X3}L(~>X+O}UM`=EmlSW@f}VUL)}f9fL2G93fh#
z=_QmJR8;m4CS1-czo;rq7SG!=G*ovCN>kNVT$$fy>`iL=dZcM`=#1lM4e9aHRTS7g
z2O<^Pnv;r$vJlHls%xd>#vnNRgmNmL9@r!BOy1gC=S|-@g+z{OpK^;+lg_Ieb~g53
zG%<yvvA*`D(!MeWXBt1@H8K|l2eoBCC#w<E44*J4u&cxocs!mT?Ve8MyHkwvzVMj5
zU-C(vNsqD{)TvG2ta9g;?n^F1IJq6g%O{?Oah()OrCcLdWq{2Y_6A#bgn6mwx;sm}
zD&G@zY!_}IR<yh0y(u_C1E7U4KClG{dogYF-Q1SqDJ^EE*GGmV0JB$3B_*Tt)v<+g
zk&;5wpujRYNy+HwjkSgf7b`3O=4L?|nVm4SI31lcj<kGEhn#X958*t;w6rg!S`DV}
zuYiNA5#V<57@ZO~k<qq)(qh@=2Q`u)vj-tEfCyOT)JH_j-k+6xEzHi1mf`-Nq^z@-
zyJmMRdEo`H-tuWegfhDV!sTGsTbFCRVN-qQTMuN3L6++tsiAODF#wU^1Hy?vM)((~
zF1$0D1Q>3Vqz1#|%=V}}$v=fe$^+@l*YY^%w3mA0fB)vgV$#)9X`fr}PU+{wh&!UW
zf6eX(bByPpXg=FYr9OH(7u6I1YlgXQr7c>&KeTce$iqxNG%m&azVv&17d%}yJ`0f@
zFknr3x0{+Xq1tAl0{O@;eit&E^X|xQi#O^i&mXNmKcAL5(J>plWsZh8Dgm+@zq*-o
zLv$OD%OFT0#@rWbsrQ>b0r^{X^Yg2NB{=D*G!D10JYO#txKmq<dA2jp*LyozM(`eD
zcXL^wS!-78c*iql>TSl@9{jwLx0W`w46#VFIt!CV>UjbGFkQ0sj8|>d_|EEdn;cX1
z@bNAwMf<+anuiO>+S(b|zsbdbR9V<)Gs1LTArfRra4Nl6s8`NT*X+TnvPH}0z`%)G
zGlRER+)!aA{*LeNgvfrASg;96N#H)zzG7FZKh)4L$ynt`?vwY0t#9_zp}($Y0;sT`
zf1AN{crY`&w0UvChdrS>bEr5`7G$FTSH#BT59CFVQRxxRh8r%#iwm$`EpdEQwu}$u
z;HH^U-RtA+-psXE>FZRLU^?N&LUV~&bx5@U>5vf%h|PB|&NHacPwB&jTyX=OybCMx
zXCJ*dB2N?0ZyVgZ@45M}Ms#7_jcgnC3RZDr3E}$6(_>#DNTD>9yH6(luaI=XFqW~|
z6MgH#H|fJZIuiBp);{$5ZrEvjPL?~g1{>DJwJ3k`|L@E&TxD>xgA1&)hu^XzX`9=<
zjy$u2Rtg}_%sRf-n&E?hpgb|VI&?IGKpJO!>k!}aRv=q_y&e&UJdW`GMgvGO{un~^
z;s9a!`1rV5x%Sav+oxBse?Ix;VC!8FgMn;4W}qU?_Be-DY|FlpXd#$JvlE<yt&WAu
zKQy$VJ?!*?Hf?-AAWe?z_q~9YYMLYw`aBzO(JZqXf4Z;E-C)n^`Lqp(=msDoeF?N$
zLV|t-Jc|Qcl~a1|hH%0D3aIIf2L3fQktHYJxYFmQE+j|@k*_{~-qY2U0uDC76?iv#
ziN2S-$<@!Q%r!X7aq!>=5q7CxO<a<r^}JV-frNLxcc}PqlR#5G86O}2Ec@2lfq-~%
z4}x;xWS1Dt<$OdX^~3fX%9|Xh5DQQ1fzi^vOT?Pe8m!9h;-F2qN!LkY=hm@$+8=~G
zx4B4$QI$mWlY%@70Wv$EGJ$zEbu)@YY5~G<1@Vt|mQDn-@5FGH>%s)k%InIhh_5IJ
z?f~o)!t)n%CL=SRFV(<n!}#d~d~k2e=awJ$7va;<9m`K|o93<ObnhrXTOlAnkG!t1
z@x$%&t3bXnvsFArz3-`x(Wl1Q${X3S=7#1K=k{8ZdPUU*d3?|uT-_BQGsUtI)Ba}J
zTPhuAA_)*A<!w%>w)7<VXN_9CIMQ#Od#b|4aO=Rg;W9qna^<fsyV!qcKZI9$Acp>Y
zIg=FrX~)9k8?fUQMiVb(V#m?3(#&Yfpe<%HPQYn99|Cgbu*$`$rKj@Uht={nN-2>q
zOrO7w_TMKD+@Li)+{T_G&6#h;j_3RM$3tq2c<Yok4SGv1g0u&49_<~%UmhMH{`uq|
zIVlE`(qFf$l)h+vjR~dU!s)iNV~BF$>Ol;<-pBE8LX7$;goJ7_-sOrr%ztz#Z0Q?r
zufP--K*>@Op4xydiip~BVw2P^)D#{&0m_a^4ZWvNO0Mfu&uJ*fK2sjJ)W<>O-LupO
z1n-9zC$CTR#5oFMC+d(NCMY34{rf`C7`v!|wcm17e10}a+`+B8^>zJlW4PkG&9}dA
z69fh6@43u0qwu=&I3&R2{3*d3?ysyiUv}%D&DMV}hfmvSqRL0D6Qc;t$Qhauq$<ge
z@eS$!SexLWpe)lb{Ef>l%aa?eD7GuIAMPdd$(*>NPEKupeoXx5Dc@;1xw*lVLIA&@
z%ZCCb0UEmZKgGw8|C}&>bQ?^fxX_0QQnEF70TQINS4%XOUHu+8S8z_O6sI;c482&r
zbM#$T>j73)ygic#Js|QkFy?Y$R0ima34+!p=STScb$_vkno8DoZx;NU^PE-Wc>ff5
zR!xlBbV*`^&rz8t{icS9;sF>4(<ddzI%LZoUiW;Rp5yo27LJj%oawnUTEcxE{XKpS
zb2O*R=H=o#Hbv9$te{uc1e&4jm^*I=o$$23KV8UBhkHv!!#gr6$CwcLT;@OsG2lbQ
zLAidxA>6h@1<eTj3cEgYwn~4q`8`Rp*8oeN4nu21P+q#MNEkE$^r}CSSQ=iC?2;_L
zH=zkpEmm^L{p_l^Es-d{nbwhDbqb<RsgkVDaW9;S4Mpa9vf)vyucgu#FHuE?^e@HU
z;af`t2W#-j^Is789PV~7qgX0&9@KN8-5BNSexj>4^swG{cALI8+<{vMWe07CELJ<8
z4|%HJtp)?;@2C3eoR6`wOa?nw#G<f(tPqU9N(m$<#RvF57=BcnVvfDjsFH(+)g*jI
z5R{#lgUs)ItzKJQey2Hs9wu}*5w-Ycob1NK@<oirT$1ZItMq3rcZ9??A901<tJ|!;
zQ#%KZM7V|k$1&BQN}%EMyw$f`<)BTP*B4plQ3|z6MN1y4(J<7RVM%u3*uRSNB3!Au
zwKju5OxA}hy;l5(ZbL%UJ-pC0HFHFka;L9{pq$RxAwZUU7P7a+tFzamr)q>ez0S#S
z(2}A5Mb3{^1`^!WM^i)k%gri}4c^`cuJ)OQ`O#@^k%rPURz8|Cp7>Dhan7ODvirRy
zydSR56(M1Pu!i5ZRk2MWMPOa=w(c&;enK|`?)U5sF`ai^V>>wiqj@c<%Ao}6Xu+H)
z5orS8!t&Y-eQ;=Zjehs&d-49!vst4dOyw(cz;pNixO&I%I@hRMxNU6Pjcu#3Z8wc=
ztBsw;wv#4}ZQI6*?WD2J)4kt)-t&Eb*PrKFH|9O(m}AVr#-#FaREqM^kM=Kc$#qrK
zHz?R*ztxeDkn;O?d~mR|@h*y>pacR!W?vs=YO2T1;3{A_STBLJj>n0bDyy#eQzDr`
z6A#_@{v0rcT&zk^%7@g_xd)VlG$8>=393xkvJ!(I0XmyvLaXgMJR>8e?|cFnlspvp
zFL-GC<xW3R@cGv{JQSbr0FC?a4k7~=No5F4Hh})@c?`scGTRe*z}<?r;6&nnl~Vt>
z-0x%UIj(#0J1>8?2<?!?;UTc!2t;N!f!$($8wP>INN(0X!Jky;bh6!U=BBsd;dd5B
zk5whdkzoM4(_~G7dfuqj`^9A@{7M$Gb+6}3Zft~LB7<`b6Tup<c?F|gMyPom;v4;E
z8qflkqcY7Q6;_Q3R_kfb`rnI$8AriUoYddnt*Zqa8~XfMvM+UnBEfipT~tXaV)*gu
zExxPm$JZv2-y=u|OfeyzZYuHb_a?sC9?4nZXfUG$Pfcf?HiZ>z-EHOmT!m^`^^G&O
zrkJuPi?2`5FV~&}%^y}XV^}8IMs6FKB;ORX^p$Vz??Q{SN0Rltd|gKDQL{M5jCTqH
zJ#^&7Wbc7~wSY71m9OHQxf6**oN!6LQ&!A0r&5mVe+4xhNnbiDmYV>bwT|jshE8ML
z%nS>lC_F!EE@^3b`*U;)Yy=_q4A19p!5tl~ZEdQOp@78R41l9&XQKU{RIQCzExJDm
zf(rqB@AsESB>+rxyAS{jW$OaL5u98k2DNk<TD<P#NJPfcy!w@_;~Df+5s5&b`u-L+
zKv~_XAJMyY81*Md$PmB*><Z|qX^%MO^?ScWMF>X>h!%jn>h15HSF6EF*}_}j3Sxis
zwjVSf6da5<81UGw&b68mQtJ7cT3X2xZvuQ59E8(m5_2yPLzjoEW+w33W%DM#grr>Y
zE=PrgDCm_BWU$o_M$53-Q9S&uV_{k00bBAj>j)pOFmVB}#gVI)=UR*#28xKFY+5eq
zx;EWFQW|N=S6akhwEO){)4&fm77p&mpb)u>tb<gk(@i_V9LBr3-sJV@fq`_GL+x7T
z!<(dZkrm^a2_l8YjG&P2+Ug7qZWuDq2b3e1SXa3CUASbR@wgk2K@VeBDObJvyql?p
zC1$0U_-FgrA>`bUTC;rQLbHd25l;>WG-eFkw^VsDkg}>UJq0Kqfb8Es0!_mg$u<XT
zj@dQuo+?93JPy&Z4DN&OfVsLlBNLPQg#|PVwm5(Pm!ny$!UD4AyW8s<JZ7)EUu$b@
zdh4;n!%&kM&FOL190G5LS^N?jzd^~V9=`iy2ms+4(Ch`}P(a7II{#5t{K@Z3l{Q-8
zF9;P9>uP@)A0N-<$%YGHx&W#MTtrr|RB^-xgW$ZO888wy2WdI6_PVyYg{gLHisDwW
zMWgfyyN-^Tp_<l+qK&F6j@gB$OpvdK(<R90cY{GpL}%T*>0IGn4NXmb;w?+KGA||}
z6>SNuATtu0-B?NQE9WScq+G;hfI}P9SeWs4dVw1+7D3$4_>6ky@4_V0WENa&3S%*!
zX8;NSDzJ5^sp<mnO3b3)vzkxtUmpi7ZKd{Z>tuZo`C{=v<3S|Tt+w{fCBL}(m+R8&
zma^F+e%>fBwd(H}yB=EXZzY_*W-Xu^=N%Q@h#<^CozP@^B^J|&MLArAcxXxM_a7w_
zfxf1#ODrzR#suY%y?wZlq9j_FN-d^b@-6x0w(55udf$~*);75F!@UszJU?av$LSwP
zA~`7uK%u70(fRr1q@-5j$kAHeF2X9XxgC!SWsm@M7?))NZMt6c@Z(%5Fn-}dLv@#y
z3clUVZZVpmcxNX&HT}LX!*hSE`tFXfqa(ho%o6bKgco8*CbPA>C@!J$da!4EFZ>`y
zr=+j{zfe_Y>c^X{wLc2wU=2)1bW&O{H(L}>?e)xNAP>JxA)?G6TPw>l+;7>|jsVcf
zQQhLKGAT8BtHWn&oI}taR&Cj{?>1SJz>eYU=?bHL(rC6k?LU@>fR+4(iP3TLk$>@?
zE&`8znFNT15yMFd7)Ym_oHt^Xk$G(J4&|d(ZWO;RzU6?Vc-nus!vQ1<lD{qAG)+;m
z%{Ymp)S-7>qr_**9(}Mku!O}~exyOh23Fi(#SO>%H!sHn?K{6N+88xGcOLUoC*1M9
zxyVl>r5DaDqBZv$3+EjsYvKixB)HPktgfH7Qmu`jn){G2T)YOfPex?+)fc>on*I6)
z#WmznKW(p{1D&n~0|H%yP|h6szl~XBeh?jAIyE}117Pb(LvxLf_?d)+!w$57L=^7q
zth%#ZSzLgxrk2Rcad&-7@B_o;?FELj(PMcT3z4ub@->qSWxCOh6ElLIn)>2^vokO6
z=~(TDyUfz^qm*<wpoqos*Bqb!cm?^Fy-XCC3Y4sa^Xl_hrdVi0qAvU<&_R~v$LMpr
zVPxNY<V1X20y<(RG<&H7)}B%k*^}bXH0FLDjDHa)W?Nv#^rbSKUAFY3NvSgmvQM!d
zEB6TM`3o1;MD!Q-qtEXlBZl|xl!ce4%3|7yvoK)Y*d#JYi=qmNYD2wa{(~FqNIq=q
z$&z(-5`(^XCI+5vCvs+IBVmh^qKKcivk#9<^-9vxyxo@Pl?CW?Xa*y=8JHw3%Ga=b
ztwO=$2k}ApN}w6{O9I`pcG?Pk$Ft_>j6O#wH0IBC+6yOkH>Bt7Z34sckz-ykeXW^V
zQDJVDI3ZzVQAe<ScvDBftAi;Vc~vwpD2UH#8=I6A?(hGrv^2NAKCI%bPUCm<{qg*4
zU!R_~Hj99-TeiT?$uX{;9uxt$BL?f`&`^+=m`4DVLa?E33?WZSIT0WB`4LW1w!zEc
zG!ArG5b;WliPF~A^XJc|47%7;_|$qcBO`iBN`PAO73gmQ5n;-rpSPHA=i5pyuIuWJ
zy9n}yq5WI6Ffzy^klqMxsXm%B6&^7uko>eY`aWeJ9r<(_n%xkSzNy$q2PT?ne}>Bc
znxc9u@zUc11*0!F;iS%NhCk)zCgVQbz?zO|8f523me>UQ$6?)MwSTJ3zkNuAhcuw^
z+fQMV^#@ORwhy=Fb>$<Y6|0=sM*cmVRa!$#^+>s`wmY{+Vt@r;_ZGOeU19r*m-?$Y
zS=m(6O(nSiB2x;#04VoLDC46_WM+-c6Q&}Dm&AMmYMXQHi9u&$H3>8JAU@DgzJuO#
zdFbn00}yCF1Ep>QGD8_Rx8VEiY_XjMTTP;x9s1t>LGs;s1ypar@$m?tz7g<g(b9ZC
zz%#WlH7zMCGdEqx{6cJ$99>pcmXP2qJ48?CX=!=v{gTpaTmujPrwR$2Hlh2gfOkq!
zW#v>wy6tF&@de0-81<^9MK4XQ?dHS3`~&PQw(aBka&5?C{JY<i_%g&O0v)u-@dd-w
z!#R>>ycNDkOoS44=4DisK%W-K-u^~|uEn*Xp^fy1x0H*VYjN?ZBg$6s`(!jqgs^6g
zwscEvLp8K?);yhLEe?KID?E)2*-^URK_rzTZsTE?PqUK(9xA%|BqBx*Q=XW-Bl9)M
z;lEDabntY?<kEVp0Jy+xBu%r>IUyhqk_8=?FnBW?oA^^79`QBM2#v^29mE@)BsOo_
zdAuImV?@^c1*xL4aWu<cBnz_%B`=Z&mc0F_15@-S{=jJX!{r7=_BHCoFF~aTtKOOB
ze{xUxX!%7ugA+TuiS5aP*HPq9aG}OK1zV`b77Fj>opBwK&o%*mNQg1NX(~p%W8wOq
zpZ|TKzW<gQ|E=lz`lw`whl8v2GfrKT`%TjO7+-@XY$`*hg!yfY(?FFqMTwMNNBO&Z
z1oD5I3zP%rYslapllNN&>HA={q!XLnALA)oJv^g>h&?4bDN%wnwV$N4mJ$E<xG+zH
zm(7w*blp92f|XXYZoe%`@qHZJ8}=NX{L|(qz}V}(vwxiiDR@KL0WDJW6OBP2^`Zqg
z^#1<iHa!{eNrT>9d(&(M*s4Ps=%nD-hw49H9Tzy<Xi+~hdUG#hxWF+Mi#DfOjY!s!
z0jVy@f6!~S$>_q*pPRtAAb?|Mz&~p^7K4j>Xv`A!h1kCNGpE#n->1_l4D<v~PlmtI
zu@%D5zg)<hXkg2JX_@wLc3{iFA0h=RFi#hIur+4qEB^kN!JU^8cQripfs<+Iz2K>L
zF!P45BR_=K{N%7|RniwS<My$rtj6r=p}z1uP8PlZJ-tsMwb<`vwEC*Z)0>w6k_C{l
zaEnM>t^Td7bR<6;B#rxN!lT$vuKjtU@gJc1@`3Yr%Ma<UQ5nTYEF=97h`&t4d)@<G
zjDiy8W{>EiIRsNY;J&npK?A(nz`S6g1E)T!Kye;Q7wbg;w*3h34{ak-Qc|>PO<n%z
ze*ki`zdI0!Vwc6Yvsu&T_&KN`uXIUqR?%RVIG?O{FHaUSLd;BNM2aarg$+hp&VwK%
zYN&LmCY25!QGs}C1g?BZR#eX*sNdM((P3^$7BMMRIG>IXEw||Rai@9YuQ;}S5E%<I
zz?iGKiUq1tlf;6<s4>EQy?QV6ybSp(k@V2;`;5Mxdv>}6xKtYD-tmNgIM~9dH!Y4r
z$}p)P`gu>V0|@Q9vY|p7p-EV_Uz-XH5BFjmjlCfP;!lg9Dows){srpx$9057L}20U
z?72GEc3vYk-I<dOcXg(-s@Q=|?f*t9$WNoDN`ea0+$;tLDGNYPeXT)6;hb6De>0SW
zst+Z>{PwE^0z9KLPMO^Lq?4uYKu!`BFp;J+sWNvE3PiX?W%<fFYRTzY7cA4RFZi6Y
zqxY43S#P@pS<tjrG!S;MYp5-aZ8@;5(2D)Ty#t6Vd|sTg-!r{~Wg{-_EOU&uU-`Cf
zpK{K84UG;F;Mw&G1vtbtPd=bVEp&H^iT0;Q6Z=#CtdgM*`u^P55AzX7LLOBoy$weD
zdaIg0scmO(wSMTz{`Bu6=#7dh)22ysr35)vTcvU}C4Ef$OB()kB|aTV1flVK)`gGQ
z-qNBkB=lIWXsHQ?((HQX`~D`btbE<cbFI5ejEUu^tmg6$=m#u9W5KV7nX6c!=Z7e`
z>Uq^w;S%4FIlL0@ed00J1||T<7j4Uj*J3hK@<BppuR7Src`-xR-Sc@b(=a5apuf%c
z+V(!KmvZBRdYJyI5q{Z~a{Dgq*-f5b+s_t8&i`{cS$mIJd|Ks5!JfV-Dj`5fM<wAu
z(kmG)J1^2JOe915T@;;qj$UUgY-FP%K~!+}MYO;575GE8Il6V#VIQ58plxHaRec$C
zK{k53+aj07p~AU8p+8e&{d>^uKKzp`^-qejW35qdnJNsh7u%;BY(G-#zOl4=(Aqjl
z%cwdD5&nf+aHDE`ytu&U{Tz@6FiQZVZNG=h$MsfT3=G~__kfappiXGK^tlRlU<wJ4
zJ?C8f)-+7q=sWFXgz$6->~R4=F?_NP7fi=)u_;?$`tfb%8{t#DF6C+rlFRc0hh5V8
zOf$34l^GG151m$()xZMxvpv^ZIB70#4`LFQS|xWiGHvScZs4<og{xY6(-_(oF*0K)
z?wD!qC@jsqa{}{fj9`3|jqm9M)JZ8DOVJ*IPsSV9KlBZ^R$%;mjd|}vo4+t>EU}{*
zab&nQ6Sn>+!~hx$$q#|(fC^quJ70*u@9rq9rzJ#YSl;={Ot|Wc!s`t9|2!ygTYc${
z?{E1LSVWRiVIE$*mgZD;YwmrEZh^tU3-Of0T3Q}E0|<hie>{OihSQeMlx10^sLAcf
zzh3AH@SGKN+@*;h#^s(FSZUb+Z0C`|sTnG}u9sHtogCAm*q1r2Q?`zd9>nGgREjId
zv)W16<{EW4=_!jp68HqJApG0L=mAC~89GtirAJgv3!SSc^**Jo+=(wq-krq6&bq26
z&S`Sr0+eAXVdXVF!6}!x_Q{^<-)O8w2#Blp4#VT5_jcKyo4c%i=3m_K8hyb(`S#Wk
z?1{~nM^6Tkr1Xlb%Z1LIm9jfDQ>J$htbEYabD2j_u0d60hK-e#Y`OAYPq1kvvd#$y
z1Jd4CB%of6YSiMs(~rM#VEO6F34MgljL3_Mjl|CW1xLZP0y=Y>zajsAUjxVxCjyI;
zDy8Xn09qXL6Q=b43pxWr1!>*2AhC#VpGgBQ);xebdw)EAm7gC=O=B^WACKGqYfgOp
z?b9vm?XAS{=;&A$zwQ0GQ(|I0qXFUjpVFZL)9bZnfh7naYk`~09(;VDYxlaRF4{vy
zt-IPeBOnN9)^!GYf(;Fr<7z5d{4(sUH>({!LfYCC`;W1)xtXT&0N?&^lc)!Yh`l&W
zCGTK>V#AjZpB+Y_56g$@%+MD3w95O`+6*l{E0<JVPVa07gKtD>yw&ZUFkU@!qw#p@
z8LOH1qC%?UhQ-v^ddkv(T`w_Xk%<0O?wdvy)i7-g7aB!qTb1F{#EgPTEeE(cZV51K
z1vT|yQ+a0Z)y-O5%Tej@PxPp``g+gZgk_1%>0!Ve;lH`W*+A>5?!kAQ=JpJ-r<hI0
z6oPhMKXzSxbj*TZAGkYwpXoL}{AMO-7wP<jeBo;9VtI6JW<mvrc2S8bUHAXUA_P(G
z@YBcZs#2JoOSFu4gHu^W2>INzl3aiN`Zcn`qbM6m!^jAj-t*w#40rgnd;3U$Lrsnh
zdsG&4(r`a+F4x}2#c!;2`Z;fOG^v%}nVM#$bF;Eg7H@*uNne^f3Qy146BCS)2*URr
z%iUFQgaVI`EulhEDJUR$-74|eEo*WOlS4z7zqA3O8{p(1+sTHd%8fOZ-25hN&kdi=
zcnOvs4jUBH(`<5TDQlP+K9otdAW9;D##uF@NtI4YO6&rkG0P3nXSDf*;^%vR%Ueu{
z0{!rDVxN=}p%dS$xGks0FO1Soo0%^m>%sP7#7O<FNZ@8I6&^D(Ak}%lIdjo~tj&}~
z>#4o&H8~nBzQBQ3$c+=cPsbdHTX^1AS*feN@@<`HW5Cf4A^Ewy_jnjKbXNNcWIcg~
za+XIU2PW2-X}h^d|J6Lef_ht6G^eH_MTt)@F80ZK9nFOCd;aNBkHp7uK2q)pf=dT@
zp2Mce>FEk7Jo7dq=<ng--Vi~#_vdFT)jQ6@J-{T1bLgH-Zl}on9ZiFcuClT;S`9=I
zMFeoErPXH|nh`<){Bgf9p;SJPBT)N6X2#R#kfdIlyCB<~*W-1LJOe15fnqGGon_Ew
zibAmXKhm8rpn&OB>Zg_k!5i|Kh$}G!Nn2Xxkm4r@lNy;uQ#Xd_=M2JAHWEu<_ZYs7
znK}?-=Y=a(jZr~$quA3yrpuBTim|iLKqP-e4GQhuYDONh{su$Svxc`C7?Fq1Azc)k
zc6?-Boa&rob5-f2NuZ;p0QHv|(Kux<81uM${h8{e=Pui;&1(rDo$h?iMu{C_pchrJ
z0TKt~xXJBt(RgtnfH({MZ<Q`(%wn<8(GSR^q5UN#L&vj;GShW(Dk@yJ`^EtZVrj_`
z9-fz63e#sUuHEha_{vJB+FC|@KKIA2AET7=zs^<{e~ycjk?*~}>}#m10`e8DwzJch
zf`UHz%oTVc|0Mp$&2H9YM!`)-L-4J&j?A{NKZuYoRATg_*WKsZ4K7%3e0V&OR$&0y
z&DZTrwWs~l^4EEn`i;q9KU46h{c&w^WlLj?_t-X=fi)(bH%CaOZgpg=r~SIGZS@8f
zPx4}Z%;+?a-G#eMG4=mbBMw&>wf`zW`?{npCYu)d9)t?~TOz(IifS3Szb6U`fGJe|
zDrp0;cjSO@^`rRUGXa5%iHVlcPUXS^Agc|~@8AQqx+##7x_NjEx4OCAAF+0HbWD%i
zKvS#=2*{^%(9_Y4;qd{owC@)nJqg_m&&#h(#RvW`FEbJ`6JL-`QNGC}iB@W%wfeq}
zJ>Gz&ou&C7aSDR-SRE=FkdnTZlsqmKFHS?1`aCqan_*~Rk1dtZ(0W*Te06i_j|L6j
zeE`t+yF0`%Y9tDVn><b?-)3YQ*ENo;6WfJ)4M7oo1J2C@5@Z-6kZ+6Yp}vDdbxn<<
z4A4PW2Le}I=z&QDA(pWFnhKH+Yk#&?0zjWx_XYZ=FF^nFU&xF^SJlZ;A5;uba7tCB
zfgFo!WPFsCon<D#P+3h)OkLe~qhp<p?j;zZ_HebaH7zY8NKAo`@W!mEtc-=5n*hyG
zW>h~djfY5ZpDC3N42=JW3@i2+6q0^PDfQi9?z7804h~UaL7%D`2G0|8Js#lGh1ru@
zeK7OGi+X)NF)5OeuTDcl3ODTgdm<sK5&*Ws0Shb<jsH(yT*SkpxvkB6bo9N+T|s`x
zPo=-7vLZ=(uEe+OUpDf8+J>&eCIIMC@yi7V@&_oO=AE3S?rY7MXx7nG=-xTRpA!;M
zPP^u8l=~<M0o3mRV}y+jTx>Qd<)8XAB?xaa*{u81M<HB5GFq%~4-F6hJ%_^gzW(ba
zW!5{?m4Rd0!IkZilU3>yo<+X6ZL!A$et@RprId(~DDBmNx%oHQC5Qd}oBO3k4e7{j
z&ZY}(#^>LSlT_aC4VMJ5Ujhw5V(9_)355xCw`2AOhbL}d9W)yaHZ<bmSm%$PGp!i~
ziuTknM<PaVIjdeop#HLU<>i68SFh2b+L(FOtA)6`WL8}R{LN`UZ|3tu=H^vX`1WyR
z0^RiSRRdS@Yh7eCJ-h~>s1q(GumlGxOTH_4IttXyU8OH=?>GPKc&2dM1t!~4)`UOP
zMv+roJN&rcwa=%wZ@`^^pa9+M2ibG|_9Fvrd~6l9r=x!@mUTda#vY8(M0|jN&KB6S
zu;BKi2VobZ8bF@4bcs)k^VG&vuAJ@tnE6(*YDqg$q36MqAz6@z93F6G?HZpSHv9O~
zi0qS8$}#N;EyDgS!9wr;L!^izUbXKXC`!-SRL#bSAW<~+gv+T^XDOQtD8F^8@JzGy
zvK&axz98hEZzNOj>4wU5?I@n6_la%o+kr7U;~~n3dW<U`YV-(&F&M*AZ*9jvR+fud
zMK<0BWhB7zj0By|^)v9A|Hm0_r25#0cX{(ml3FWO8<IJ85Xgh6-wU^2b4sT7g`?3E
z6=e?ih$`&zE@>B8x(r}|qY)Ywmdb9WE^z^|Nz{RPk#PDc`w;EXguR8vkz&OX(FbAR
zmh;I|1T#dg53oOv%Jebr&ZTZ0)NH9n<_MCsHRb?DZL=)HV_0Aih|$xTiRty9_`9{U
zwI-`#EwarYQ3K=%+6A_SKYEs|O*-#B@vdLf`Q9ItgDKWU+VR%#9U5CoUAsmMmm+Gr
z2h}rloaoQgd$z3xw8aIzB?RIAR=Z)A6JWvMlkCtvkvmLg`;3<Yyx7}$x*!o`<Yb~-
z{5KYK?j*G5FDOE=jEsJ8bVyj9Kp8*K{faM(g4jugA6XT<J{k@5ncc4OM$YKA#svSi
z^ntFu8GuMg|8rCZYb~QqG&t?2^uXrejfVtpt=Ru#Cjc|#aPjf+@meE$h2F+fqwAlW
zy#qJE_j^yiJij|$y7Q?c`oEOFqHtHa>FMiZU^cWTx-U+riX@#&KdZ9%exFYIfTa~i
z^kVBtyk)-Gfpg#=6Mf42$?LK7v)wl6C@fWq3!702!+^~Z6uG4$&&R!k!`+e{Yol)>
zB-CI0V;w3h!$ULf^8)R!jVeNpRtj+ikl@KszL7-0U2}SCLxT6se5S)H1vp|B@=^2C
zw`2B68{gKK#wSGl_f;g|>SG3`$-__bnn>S_NTSEZ6?yyE7#YdsNBXmkh%L(^#&;+D
zO!~^K#pYlsXF?D%9IFy*$gH^(CQgPj8bPycEvlyxBxUw3IGVs1MBz-4Dy+?Bn^{y@
zwh`x0a~+qGY7`{cmVkcVX>tm9nN(G4ZgoC*N16kzH_3moIHgaaoSRsi<LE;~vS9($
zcocMVDV=~o_g_ZKUlE|wK`^o8U8nopOLBKvZyH?P{>Lq3KV3H!F){8i(YOTFO{7m3
zeEe2@;o&i`u)W%0_qC_qwU4WwR;7H}tZM$!kA~FG7erZYe)~sS5^H0+RmpG)46B|t
z5^IsKU<gp|XDKN<Pg{3AGIkLlZae(5dB0Q4jiGdl#$uu@)(vW|K10*Vp#6YWgwu*?
zqvhsH$;r^LfH%8OwGoq*Pv=?a)|^I#gLx7cXcFQc@_3lo=hc52iJ*a3`$e(s3S8Gk
zYkh(|<d_EU*b46n@tB`wieb)VuSDu@M?|6`6Mfv=m|>ZSZ{4*ztx{ReV|}pkaRMzQ
z3eKT_qT`iTp-p<M3+=0kT;XO(7roP+8oF98l{5w^5+0}d&FMPRN7ri{7W$sf(jW3<
zXa)kFt%fUixrDE;m{EIz_MmBrVuF8BH9#)#&()vm_-|>2n@aYluncFwndhO$QRcv}
zW{}@LCI24rzqI&dIB?}8`~olOWZHehQE;}3#Kz7HR6oPndXXApQc}R=v6FL<&&U=r
zt-5R$=VT{$l8)Kts?zxVntWQ#*{LY+sL$+L5Oraq`)VLBFhg~Kv4eq;8teYS*nDj}
z1txYzwRCH%H``8<eVS#2W}E=s)~?d5WmVAg5?4Y^Yi|yTQ5<`jD60W$^~Lt-^`3m0
z3G2AhR?>`l!1@NkC8d7N#!z^`Xp!FNfOj0mDfMO*2^=`c!0>^%nr)du2}~a<?icHm
z{w19g3OnBT`h(%9gH<eifJX1+-h0HYQ`7{-g{VeowIty-UK&)K$9SRb{a~&=R!({0
z%E;|~{hAfGhUz!^)5h(yZFZC2!Ce0B0p>KBd5!c8tv5&MQJJjvxft1&(p&E_6Ja9-
zF>LLhu+c%VD@;J)j`2@z|NmcWt}6<4H8no3J9XJeOIuLD7h*n->lV8WTVs}rf`X>J
zydpD&*T_g+Ik~;X-%i-@`&~Z<xbRVGZ7JfjGG5Y#jSW*xO!jWBud8<qbaY}K8>$^N
zR8$P4q_nM19YsWJ0J1_R?+z-OL8TV!8K~VXp7OWv)#pkHB$2mh7f8S&M=6;wct9T(
zhns0+Ln}pDJFKxcSc!=C8Z~K6F+nx7qu%K(Xo&yEDwSx>x@E|!jNQr|eun<#0!NRB
zy<7U3FWz$n{(?j~=N%6J^X1EKq=-*lR52m6K&CUMi~~DC^%VCUk9Ug5#THcYIS9+o
ze5IS)VfHSn0sHpfy#a>zFd>R_OMdNa-tUzSjwD1zYcbf6<aNI>F5@yO+$ege4a=R5
z5Ai^Z`-yoomA%We?CV{!I@st@8_by5N>2sM9?Xj=G;B4qhFPB!n^tqUaSbG(V=U8k
z`bMJ85<d)cA2V}W#^I1k*&|d{i*{oRzEzouU07KT+t*4?5b8z?gqz`DEE70F3?cA`
zREfoo&N)qpf0MsD#{ZCRWZ^cc)YRpjfgqQoyo_vvh5V3-{c$HC-NPq<vR=pKaNL#s
z9NT2Kfr=J?m6k@Mto#iaOl$9mYyanPWne&DW09L`1n&FT7BLKr*XgNgQL&lJ*@~EK
zBnvQzTClUw#YGOdGB9xI5B+t9Db*_1H?_{ry7BQycSp13@e%j?qcQuZom{K^o-*Gf
zm<qp2^I5z{*4D<fq(=4W+qPh`-XHd`8>wr$)Mt2bp>Ovf0kXr4#NCGFS=~jpZ)@SR
z!&rBe_$$ZlAiJCxp^~un*fZ3{N3h;Awp}(`dOn4|cLUKs-Q}73d!71w)y|XcOP=Dd
zJ0@<b#tA{o<p3J_4>u`rJax&Ap+_7ls-_MbG&r_^eY2bKM*&bmt0t)e5+*nhpOWtX
z#d7?of5)>HEeDEa8944Ny#k{@xSrsT);piSHu}1M7I0jNzRo*V8$9S)f&rj}|1XyF
zx9)}mh@>RknJTUP?Qwy<HjV1j$rSIxFhDhq1oRE>ApRr20hKK%eycYaLP~mOB8$J(
zN@I7WzWQP#TS3mp%1+Sf&tWc5Of@-~HC9*u2pew76`|*_h3|B`7?{dcG3XTVcy@IJ
z#`UbNrB7xTM`1JbvfnO^B(`P?_@t*a5#883II#1!UO76>3`X~VQCv@+H{3ouBS_0O
zNc&mU>T!KJ(Pf8rQCeNAFu}7sfn_HfvuLp5y?Yf#fw}%g%dT)RE|=eWLa_C;dfsZ}
z=l8(KzU$!dD|I7|D1+H9>(kabPeslXpC79eg}Q0Dc&DbvH9kG!d|6Y<4-rU=BHNf0
z2LdN$1$cYygtW&B`2Rc~mr71)TrM()pafnE5}jgX=!d*DfQj?ul#%2mOEeA0`lnFJ
z{XcsTOycg9?60@_#SE9X-B)rwF|eBrzSdJW8Vrm@WOjfQ<Rr+SZzXQttKBH<nQi<&
zbk<JJ)m=On(ZMKL2!+=v524b8sU8l?Qw?{RQjXTflT$xm*N`d2k;^=H$Ue`Vj=qn<
zzk2PKD$YFmwt+I8CizD*o88Xc&iOl8M;;E-YE2*#@faA;xHv*ux$O-+4ArI<aMGU2
z{5KCiUF}y~t>jIMj5DsGL296V?0<&Kac@MW2pL6<%-@axd$d(qDxCasb<PFMzvRIL
z=9n=6(|ql&_^$>OsnjSaW`p660F-jG-7CMnP1oPw`^(eru+YKeMu($GYk4_Zx}KHk
zJ^?ZlXwptxO)abKW$5`H38=yW<%IR!X|<A4H5k<0TCG`nW22v%nx6Ly40x#+a!o)$
zqrd;3;rLd8m@9G7Z35JqgSpZZPTSMt*<vjJDjqK+7i>X^wq<;dC^59&h@Gl;tbK7)
z(&RzFJjOj^6OjY?!XbT-Kib_m*KT2`ApVLB9Tx%g42g>o(H#rzk8FF(M#FV4gG`=Q
zgof6=Tcv5;-DV>Ca!=Y`54fYqd26)&K%KRSWL^EH&+jP{hwOCSkQf)wS9^P16L2%<
zoG@`7KJJ-X`*aZ$5B~}y>R<4SD>m?6m!mr;n*lPU*?IG+{ZK*^e!;TtXh|ztBWAq_
zU2Q^>U5qIo8`2~5fy8oLO2!Lq(gY2Hc~Z%La3#jBqa=sogvEy}LtYjf{etoP_PXK!
ztojOK`lyKmgyFapS9e`zNyAGAWIx`_9$>OW-$4m7r%B{7|Ct;a`H&~3-_TIx=-2`+
z91+&s;rq&LPtQOh4qzkOS53fdw{%{&xZl4G#0StsdZnnTi@&=M4p98k!~1H+sVPy+
zNVPUf3v_<(R@h_zG8R0_%4q>XUHW9PcLVQCjY&EZep5o4r?kRvLH{}D@{)Z)u{NFF
z=3*@V9?G@#45qvb#%7b(fG=0JpF;0LW;mzq5a-nwd}xvE?NC$u(8rJ|9xM+WJVcp4
zMM)pPVs&YMBD^R%VP!V2^U8r6>M~|pmgN|dMy0%V_a;mZ82G9Q&tWgahAU?EmAJ>w
zn-2wP=8=!E%2r8ilbAdlO$at%Lbzo}t$Wh{%V)l5zXr$aqDpemesXP=(&1)%%_4H=
z(XZ8yfYstTQ;xeEKHx`ZL&?*6r>jm7HNI+6Xh|Fp0<`^Guz+FaVt^AMT7xZro15LA
zHjj8RvOX=kiOVf88|xP&d`(-6Wng~3)W?tKR*R6hcna!DO7IA`2r$c&dyemq2UGbH
zjBD%3o*vvt_)Dyujn4hB*`+EPiAl=m8-lcL{c>5%MpC+K`SFy!1o(787`d-6J^6Xx
zbajL8PKuHAfhj8TDk>GrmluNR(P$UqkQH^6vU=jTi|h4oC^_z&TVF38Xn0*t_7p8`
zcKi;|-<;>K-Rc7)AG{Bk@HpWpFHqodYaf*uUfSXMB_8MrpT1TQH(PnXziVHLu0=rE
z-O+Kc)c1L<wEO>|p+MR~Qo<DElC1y(Owa%Kc*(m<YnjtP7&>Ojo?7J6Efm?bM!Ij%
z7p#bxeI}z5KH$q-u&PpCeS&!m_<QEGNkINWi<AobJidJT%F6H$tgu00QF72QsY1j<
zWE2!x{9a~@l|iPaAXL<Ensh~OcA1H^J{TB@KO`y1$j+CYI?BpG8-Aw+Qt^Oik%5*L
zuURyw{}c*%e6j{Mej9=2{pm8A=xp%WiVFw(E&pfeva!#|(B&G>$43?-S8qq#ujzm*
zWf<UKH8?ov2AEjA7h8q74FbRQ0++@CaEPkQ44u2a#KHi$wMXEu4X_r;D0VdI&qd)s
zMDjIt-#$RZVK?w!(GdHQ{oVcYgDVQ3tRfw@7*0eXY+;=NQs^J!l^m?;Z;J-;0Jef!
zR2#JX0e}G4^9K8IdXmj^T_U<*RW!%9li~x(BR<qmwM4m)Dz>EqUk}U=Fgk8ELn%(?
zn49KlvP9Rm=hfw+?baw)q@qI{-LijZt8}or+}XTr(okSQ49Nr*bOX5N|M!fV=~+Ie
z3F-sxiFQCJ*7<OyUqhBM&VXEY?T1u?iJ)j<h|k4j1{ZN9Bt%F^$Qsx%X4kJ?p|x7k
zrBvD+7uO3oR$bZlfF`z#mRAoKLjXEaM|}NbY@lTWv9zeqTg1!3^rHoQ+Gev0ML_>I
z`yu55Mh+pPDP6>SmuskYfwXPqZ|l2D<e?~pgXAHy$KQ<FcnPUMHX$2D-PwTo)<3Vn
z3L?6lWFsUnntG|;YSF;&mEAIdJ9*S<BbEVI_vuh6Ejl_aDG4@Y*5_R5U3SWxEq>Vh
z&msBWG1PM2oMp{`!`7_x9wOowc+ByGYY#pFT+&cQ2F)&5%nzH`2OpdKNeQUfw6U$#
zeh%!xHK4wPQ(#{spdwZX(C3q}J7m!&_ge5Ue@DF(eYv_l_?`V?HW<YOp{Avz<j1GS
zR_5{dBqZt%S44PfMD-G{j;Y-i@YoXC**DdAc-zbFR8d84USf#`e}yB`@BWJ1_7(I6
zovVnOPXH?eZ3>z&!uCqzBSSXQ_4f*B?CD-+gB~>XfvxNABd~-YB7V$*kUdsUnM6^c
z@_#msdp?opL5w5Uf3yy@ss3X+f0>5SgXSWE%&MH?FARQr-8~#LxqIC$qlgPRf0xY-
zJfQ!(bjV?iVNyg4$+PERdiLDWgAPI{Q!T4-io}=^H~VrN|14IWc$)+GX$Y3qZ&bm}
zFiki>cj>U*emI!KU|O~UdlUmwS2?c8kj(-~gT=j&(5LqsBM{gy<Wq(9?!eYcf8mK4
z=OD27H7~>eDh1giMk7<sZ5^#ISP_cTTCht#_{@t)WY+)9J8d3G^M2=Hm!Mn8$Xnu;
znH}4BHl}ym`A+^A`fn;g1*$Ms(eAI2Acsc0U0d%e(2M=PN&@2hQFM)90?l7|X`wdu
z%aZ~T;uPJaNHUJ(EA3In98Jq*GMldfPF^K=M@F}gx)9h(nE1QZZjmG`<Pb{XRm$LA
zlr`bZTC_*c`vWvm!EG#wMc7Wb`LD(KvHiVEh+=1f^vPagDn9Wl_T)h@L_tkp+MpO=
z()2F$lEBP^Tm4v`IO3(Lvka+3knN<I<HHvfdvi8*u8D+5vD<jkkxk~WHvAFKb)Wu(
zG6wa+E1{H|I>e3QV566fA0q&-rz@}-E$qPc?>~?jhZ95soijc3Ty2&3P0mH3BHj!@
zFS4XZy`Cd(;^I1zk^TDTT5Y#(BLp%%-bD<FNe~oin{`ew4pZy6IV`g@-a1;c;7$C7
zuF@*9$Z2@nzK)$i!<gF*Zcf|axs6Hi-#uSNVNZCPCrtN3oB<=}3zq|VuzKIm7){!C
zRM~SRCdFg#n|6@f6DdV<`9|Ly;^e2Wb(`=feW!Ofz~}N#Jy3SDv;j-*&2IGoIFr-M
zyrrZ-w2=n3C+_ShAxJ^mK~wd5iE!YWfD)vN>2{%**wt~JuCGofL)W{b^rgx`c8bz)
zipUD5!J!$QM<e*V6mD@b=?}p~trL&mil0I(IUL0{Cv*u7KE*I(tiVw@ZSHKBG_sSO
zzW*j%;_Fsn{fa?2-npM+Qz#6Xsdv=|Pl>8u#gFuH#D+{HC5IMa?n&8}zKJQH#Y4qw
zBm*}WCEGyoKxtS?XWX#t8>PU)0|h!t`g(z_Cj(Jk!04XgRA}*!U*Xtyj)1k4HU;wO
zy81}G=MbBV$9hC~qLfMdUf8g0i#6dl+;}}<BNV2ThBGT5^K7`0YSU^fJ_jE2bs$UI
z$eZt;N8Zx3^`#Y$&3oflf0-3A&Nj#sc<<+PzpQIX<8l49b6{^}j$2_knl!7BRWLdt
z4FMrnQ^geg*Nrm*Si*0R4pMRNQzqK&^F0cT(YnsNPs4uEloutHWeAxqa>Cm9x^E0C
zXy(S2OyXWop)ev~oN@{su`NsVpRf1R3RW!?D)~Dv_cs%b8XG1Bo9;m^v`}^(3gau@
zCK!+rLPm;)S*z>_s%AlQdsWSwxE}(978f}x(@{!u7+Q_py0kYW$>oB&5CB8hTz5{W
za5vgVscnA7)hYcv`k1uEY3?o4k#bpdBc~H@)No79wOiAX(I|}WGB()tJyU%tQZsRd
zBT^3B$J<|94$Y4UvN@%jvY&riBoGtR;09ejP|-owPpc{gMXsl)-}(!K$<w>fq$71y
zJ@OYqJ4G7fSidzSCobyMc}byEe5+Nag5R7By<E1l#~IkXOosnyz^Uo+o<7nQ_zfSW
zT)t<`c}5*$aCu`$hQ=ntitHkf+CiZFTvZ(bH|VW^gsS7P*?|o<NG#Vvw=$UPt-2f$
zC8|1d{7G>fKc5!oBu8)fYe->2`A>V2fCQRD>(!+UI-2L7^<~LFP9p<*jBx6e$$&jJ
zkQcS|(6od|9`@mx{t(Vtu=@8Mtdno+Zba|M&5ip)ylEc?-SYuC%s3arY1x_sQ34gj
zgmrLLf{%z5lPrWhUDr#7S)SCt9XHI<b1kMlVaSpS?(g6Hs7X%VytG8n;|PA=!f7Yq
z{Pt3zRl7Ek*(#E&+UbM$x_s9PxbO&m1agaaOglE*bx9leIvAx{rMdI`tEb@*YPfaa
zKF1gKiBQ<J`H?t0*DLNAigSIM(DO?=8B<chieen1%d+eA1SR3zGw@BwPU0{-XwF2T
zw+@R?PT<v&+cOpxxSlolZz>wsHd*;JSly}K>z#G)e7F+tBa?&@j+VHHcFW$O`%U9^
zdpVwKMEBBDEDYEptOF+V6}!cujkS{*FCEd_HmG)xX3f<-o=%B)5VvR-=rnOvNV+b&
z(AcPuuVrbrMA;gX`~0r+tm?!Sk_@Ytd2KOt7HL1~!m*QjC6sB+i&?E8@_#?RkVtQH
zSni_Y9xR2>Fc7ipYI$d56<|ZE4eo1Eg{4x3p82V`2sGY>XL2O=B1kcCsuRaK95(SC
zpY=eJJ6Ko6X4DoXbg&7uhZDx@G`^aUGo9TABr5p)x@s{fp$fjaAXORN&V11XlNW${
zrj|rwiW$Xwl0-@DuQFk?-PMsMjSv*qaZKx|i%$X%8SGS7>^veb{LOY08s%wiTd~@2
zjev0#8TP~%Xor^koEO<JrD<4RcwA;?f-*T$*zaJ|TK0V#!Gd>-dM~$h4>~@bZMsWW
zt4(edSTbji9CyD6^-#{>{=CWT&e!pDI>MNPtt~E_wEWr|1E+o2KbP2mkm$Up=UOh^
zpN~%+7x%csJyuCs83%nA1(^v)$Q_T95MXgJ#|*M_UNqXQ)-2bG(9)E-^9jHPeR4h9
z!%#QaKt|oh!`x?VBag1Eti(VgMGmu+Ne(V5^l9?(`d#&LefI_M>i`_!(jV-uAO}$(
zLsI#zsk1^eTe~IX6o3I`qPa>~_XC2RH_CDf<U7kDHdnb_+3R&ay56Uz!JcbTzHN>(
zMMlG`+c6uhK6l+^M%~ArG|G;s=5!Q}Lg<AhJi5dnDuxie^LBy+CSGsc<X#a)Xr)Md
zhmHD>Y6WgO4#{EP_>1bVbX3UU_ZM^^Yr>0Jl^%2Oh~rQg^<3cXkVn6O0NZWUD$!>w
zacfN_;*hlHC+;tCn@7YohQVKX__pwYTRAd7-IZXLh69i7w4}b}Gt5nf@FMbHY3vmq
z@MD3}vBg?DN7;|yYlTq@4h<(CFDX0hUvooS#fs)4YD3EIb?*V&0d_Q4h{}MNP}s=n
z2tNz{Bb59B9iiK0p)MUHx1h3~M%<9Z!@RTnH*TNh*%}qnx#wZDACDW11w6-+dTw)-
zc}{;OeY=d5O!V8kmn)<DQF`+ndKiNcN)s?9W$rag$2bxO^B~H*3fJqr#O$mNVT}Il
zhAkatX!ElemXt01h%EC6E94VEwI=HvS2o#+2;WnX&UJTl)oFn!MaO*@-25DWTY%nQ
zWs}zSH{^^0+on?|+!prsmMbOoduDbg8lSt}R8Cb~{LfaehI$20mEDH2!A8B`I{}AD
zxKXyixu>UPb@eKz4Nu3nevrSv6*V=^Y*CQ0seZ+;UoQ43&o7T*Q&S?1w)|nypwPh`
zD1>89=Mu_PI^Wx4SA4|hW(Hj`vHlVzC;iR|&j%)8O7`_l=FK_aO`rf2)OqYiZA7|K
zEA@wxog~4^GsZRe%S$8nsn1UcSQ^-A_ZgflqUFg0MJdP{2#gL=4m$TFi}GuD#_ND^
z0jIEekG9x4N$Dpb_k>{cTKRTQx>X!eGtxLvAqvwAMg${B<n*wA%B~`z(71CYC?<pz
zC?uZGjXyCGmd@r?3T2ZD1YkGJi^O3QEmaElo2O9}4Q$$1_a7S&z;ksZdUba~?`Gem
zZSDJ-R}hCM&okp5IUN+kiC=$94wF>Polc){Hczh3&lusZbQKeEZqI(b;4Cevd?bkw
z#_Ea-kLdSj{N>>Bmf6<RmPh%9Fj;q~)N8japHT|muXXcUN4lwgpiC${B0Is>N4aKP
z-bl!=AE|y!&SI}0A>(zyvL!C?yF`D;hFX+za@&4^-ddMyYhtp>r0Wz}Hes$uXDH8+
zz7lz6tzp$Qi%}31c~tN3{UxiPc04Ec?rmpBiJY*dJI>?>w`xu5iWiRT5?cOM>(c%C
zA<m+f{jvA+2>2jPo3KJK^!gz#%oxVtUUasq;K6-#PFTVfm}AP$bzzg6(eT2UzZv2=
z_M2cx@mE_EXFT7hQ<iPHTqj#BB^YeBfKjIz;!#wG%=OPh7oYxx&%_t!=kG5rw)XZ=
z(b<NlnI#PjZpKo_H|v;8Wj8Z<Cpm0jVPUU#9gip|^qmgm0XGaKy-Z3<v-F?#heNST
zX}FfEbK&2=N0E~!?C#PE2*A<MSXS!sLUg>Mh>i^p>j()wJzT9rE8jHp$QqH}zC7}V
zh5ZJU*gx_{M@L=K4S|&7pSMn3jD65Zpl!_Ob(i2rAS(`tDAZx$gbz)Jw5GsaUfkEg
z^%2^wTq^j7&$!p|-GZWGGLgBFDDif8=gYmAMGUCsFw!?E*z>jCQ6Fmx&%9mM%OA*C
zaK$QYyj^L<q*Oe*ssya}i>LPd4<-Gv5nt{tmbc@+r9qb2^HgsD@tV=3NR_E8Y+XLU
zdsVFQA7nzBN3Kn4t(3n&VWq5Cb;=Hd{TFsg)xC?_R6q~stY>-HR*&+Cn|A5`0_!+~
zS$U{ocGNuS_EfW~I%#^0M7}fGrT?U|_qh`+zlnDYh@}t3fx9s3w6EIJ9G&G!{B0I?
zB^3qtyk(~29X)HJto6LWq@05Jz;L>wP8uF1p1C^Y2RtyFg}1cHq5DgaO%hGa2yRHf
zaRAzcC2Ju&#mHw`aYLo){JJ%lLU4s+0qXT{6%8ew!pd5`mrZ<_@vYs`7+jB6_w2J5
zn1@5i`4SdsR6p13_6$7rj#wqHN}|NU((xWjc8%z7hV0ozv}VH13V~4CEoIgJ<*~kq
z#!6*WgECR*>io3fONU4@%0&oy)IzmZ9Jn=qKs=9-UyktVK#j=TSKoT~#aZld0V{IJ
zPgfmnb54;K*%RjuR*zZkMrGyY<6~{y_ze|jUnz@8&d>9^3VrqU@9HdQ0b|40*Y!qg
zt${5?Ik|Q)u))bR_WYbpf~vv67$hPtdv1#Tj)+l3D=Rc2BD2p*y`!{MWw4?5|KPGP
zUS5nC7>}LrklT7u&>bu8QBm+LT9`t8C&$+Xg*K>YF`C+GZ@j0EMDYI;x)N{PcP=%%
z@z<dZa=N_*AVWTu!FM!ObHT$#QOJZvEv9q3HsniG;bji&)->~Np||+gW21+MY+pC&
zG9X1@jMetO@$sj8O|qjCXX@o_4KnDca;M84JaA1v0{h`E*qnvBGT-NQn0R`S=uAjd
zt$`KeG<WnJaPDqI1qjAq_ZEZUZlvkF%9-9P>hTjOvKtV%U!B<R#iWc{LOMFwa!X?K
z8O1LB6rGv!EyeAcdlxLxDa>Y_U;`!xbCFRA*N3(;C(9P&y2zW3?VF*0UhcxKEXStB
zdr6w9&RV7N$zWUoIat<xgsS9HIs=d8cGEQH^>*!vgn7IXL0uK}$l><j7|nv1#Q0Uq
z%ri)JT#7EK+YoWJBxW{N`?}6q23Ab}liRq^tgO`9p66SA%zU!0OLy5V<A{g~6&ao5
zxRO~(xckG+^;7{Wo=qv21U(OyLu+bsipqOqZG)RaYD&h6gBvUg&p|=$9h(o%6n60g
zA2;gV_j=r)kDGlnnMWAP1stHHfNg{$&6c0;oSK8Xl0TVL0fj<dcbBVN6!atMquBQA
zsZsapHtc6iomEK#?`s~IkbLH4-~2H;gt{*O4DjK-M$XHNFiu!qBQJM4pI4rnmHJFg
zy1V{}`=lfv8|zOLay5+&9E3d1`?n{t(a|E1@F{%W4>_5<q?mM?KD(0EQo|L8hrB{U
z={wu5X{k7%_UE<b<&jn11mV&Smy<flHUfNaORG|5=5#ex^dOfjAMXWVs(mz$k<a6`
z;PbtKq=I01X(<W(>H+~^J`ETd%SGaH(-a?zSg%}ZyRP3~bBr@ylFx8aH@>mzzKY9b
zvPKV>zVIsCSa_}|nz{viCnf6>)Z+8Bv9Krw7-uS;9`wi;MhTwUe|o4f`RjRFQw;zQ
zr$&?Uo9imSaAf>Aw!-9Q<<(D&VdmNHv)pSR9SfXD*`3K?HcjSQ%N`X*%w^AgGzjq6
zVSZkhg&U$Js|@ZUDwB~n-{wp$H!v?>oTLi08)LyMHYKPu@-&Yr$y~S8$}8iHzj@jk
z;PjNDq!AsbP-C#c-~%olALo)sHWBB4KA6?nn)1(XY1{fDcx@3sR$!J*(wgts9$?K^
zqYWHsm;ZEYD+&}~OztuI&1;D>9NTt*l{r*WH=PhkSV%3?rL8;cslJfHA>{YHZRrrN
zdGWdwoQ}e(rUMtpjwl-#Dc7x~@fjpl{Cy-XR?(R;#`9YO(AZAJLVp5YZXS;NgbH;#
zoE{LTniCQF?N3rZmwD2!Q<aU^Xwtakm%tG0-NqugFt_~ML~Cl)hUf_oW}yx2aL!%U
zPRX`e$M69g0l7j}W^DB2QHgvdJ^><9==G?|VpjF&Mg)h(v(rky5cSBbfrZ7zvFuI}
zuF>Q^13x(><v2J5e*3K+aWS!|+mm}f&p)V5jisfOQ3I-qKlkmM3=M0~PJ^%9|4dB<
zFf+qw(s{0NrO%gR1qInqGZcx6*@*sx1n=Hxx|;lRWGE!0A}s73)IDD{JZwR$K}bbK
z1*GMbT04Tk==pQDM8IS4Z$8yCK2LvV`75w(DP~P);4AZ-G;UKLrA(Z=u72z2(L~Oa
zxi(nlq;LoFMP3<CYgI2i;=whlf)T;|gzbJZU>RlOp(p02Ef2N`Z>t^WNM_9|4IGl%
zW<CeH<KWr}VhCoM2;L`gP&_Pd`l#P&nH2z<;Oz4;fZiLl;RAY%rX+&Pqvm62a;wX5
z);ekmATbx;m98F7TcS**;h?DWhblAOY5khRW;|K<cCQCC)MW8-EjBODF*yhJ78r?2
zkPVAoN&+qY+P_b1;^XI@FY4Hy$c!8(IP#=J;cn(ocW+;F6sx#)lzB2~H^#ts->U1c
z4e|&Y1qJlZ3CyvaR_;-wFn@Wg#-960s};8A%<FQsI?>{c*tpmn1EG3Hg%D;XudT_(
zlb^Qb%G+5?w$&8XLI<NWx$RFt6Fj)9l4@j!S)j&DaU_*lJL|2x0R05V6VYGCJVgxP
z!h?XH_srg-OH7*AMYzE8Ltz0nO4O6{1#}Z<dM^egYL+eZoa)c>e@=q}r8wpai@ZA;
zJl6-Q2UOj@h##UHtutd|UfurGSy?-QsX{y=TxcmNRhmDuiQ(fZC}gc4bm>q~WZ_0E
z)!7Nj9J%H->9jOa+H2huB!K(-x7L!Bl;rZ-r%N9f%3VQ5zO`b~WXq-uy_i`hIPS4;
z#_#P2_Itg>B8;(<ZcDtIcG=koPNON+*>`aT(*fKTiDaMq(noi;rVrUE!oe^kfs1Zf
zia`~quyfg{QwASLiNRLoO@zKPMP<+5yL+uBb=WKpOL0wR|5W<T9<jZ=S#6G3<jZe%
zrB7Mj9OdU<#cwUq`kBxQX&^7Pgm%5PJ{KkRU$q%Pi>@o_ZM&kltkS|FM(J}a8a9<Z
zJh6?8`$Jx?+m?ONHvL5vB&Ii3vGLq+93D^vRN5Q8c_not7Cw9XcZs~2YOM!yiBYiJ
zW1=L#f7BjpX~9ID&8wJdBbD;ecsV$FfThz0i#^`QIw>ZX7}sEtC+^KViI+q?-zFte
z8g@a5Att&!_}tby^56#PI!rmvB*1&>IuhrH2lm!I8p8rxSPk@1oIQLq-8;wVs}UkT
z0ZkIA4})+Ky_qNG&9-GeX7_(}i~4m`DWl^r3Rjpvp2Q)i9EwUyCBAg^;jnslgo}Wp
zi`B%O+}xsOCnmupa(Ag|qW_w2;?0n|KUIHS=!m!or@@RQY*=0`txcc(q}cZUb>;t%
z^_5Xoc5S!PNJ)cqcSv_hgHjR#64Kq>At5a(U4kIeDV>5K-L-*DH=AzG@_oNE#`(r^
z=%44$1M6PviaF;sFGfhnH5OBsmRLOJb#zw{QHaq+6BQgz@8@Sjf@_!`6Oy9yngC%y
zpOvJ-1pmH{s+;D8-5+@_?+^M~pY=x;%y~!Z0@0L6yn8f()1qdzRbBt`{h6*i)C*${
z6`uiPfaw~A&d5nyo|+Boh2LnQ6sejYLqx}K8S71|zD-qSf%d$F&sLuzaO>75`GWGY
z^DsMe!>HLLS(s$zP{g}&c!`q-h#q+V5Y`dm7`UH&@`qcl*Pvc@ni%<MK5sNyX5sSO
zzHK@79|whk7Au`!D}-9WsYvxvT|qxUECRaBU0+`x8ygFJ-?En-2d-{Xx~8wupW54x
z<#5wK5oO;7yW0*Z8yFmw7cwo*KRk+yc*-H8jCQz~lScGduV8X@AzX?+o^H0t$JT(A
z>&CB|71`=?F$klX*~>F{ENTC-1_>+a$iinP6Sl_7Iyr6Zm8Fe5M=xu8m9rwBV2Odk
z|8vTbV<nmGyGS!pk2FZu8CMF#pI}2;Z_|S}Nzm4#_7!pWbU%<}GRk+GsNWCFYEh4}
zVh6KF2Y3=C|0WVeVf15WYb(W00IiQ0<~P+joBM}$3%S={1dnQk(YN_F8a3S~mB0NN
zL89PfYP^xf@S>LP%%RA)C}-h1bm-dssrQ_$`E9MRKRoEQ5A6aJ=d)Bw&0Uf{T)atH
zAUq_E@<hicB5G-Av9z#|lao`)zD<%glMWE>;QevaQh;`MLW_w(NEnKYqKuGTZ}-!E
z$E;=K$gOOy&DT>+P0g|q{xlGv`>^o)k#C#$`jx^8MO<{fk&u~CZ56V7y0=FMa`m7(
zmH6rreKi`wNyb${8!VuMkJpp07V>@4`v&u%zlhg~`jhQNthribBC&&dpto>DBYigb
zb)62K)>Dn0Z-f(@?6(GA43T&i>x827zPVEh|A$4{Y7F*huwqj|k}&?}dp!x3mX>O3
zYrkROp`)Xtp(*IOeQ#huj)ylnH)mySZEa#Q@$tgLJ>bI4r2UQ~$2XZnt`klyAqoG<
za+nSCX2svF5p}$9i&u{(dU`|za)=Q2`W5loxvZVanM}fKs#Rw}Ho-$6V&~6X)EbeH
zK&6Nu9^v5T&cNscN9amMP7ZdB->i6VY-~D0n&3Kcgv0%mFa=o%iUu0we70^g242Lr
zKh=YL7P{*Oll#<|EaWF1G4%8lYlR<zC*BV6>Dh!K#)pAe$EK7w2voB<nG-8aFL}d0
z$xb$jc1a%}#wEl3<5FgdQY*dHyDVTi&HFFHp%3r8&UeP|u8x<NmhkcMg@4BD>gwL!
z-WpT0v9a;;*7fWi?%mtK0(3MT8q7E;BJz23ZAA}=TCnyHC>nt^0Xh-#nU`Z&t;d6B
z@A_6@z4|br%;%EPz%Qv=tk9>sBIEq__F|yPlYjAGkw1h~n?_GZ$3Wkfn2Zk$7eCe1
zAcwe((91@IvT}^*=y-w;)7K)zgg-dEFa<Heo})IZ?8{TfblZw*V=UzS@OAYFG9Mdu
zZwSu8f9)*O-@8E`H4p_RCw`Up`LdMVh?oooC+8^O=!LG|<M%Ge4ka_FdA=R`;X3|f
zmzf|E2vw>|%F5oNIayg`pXo}#pGc*mkhG|%sFakH>FH@fG8Nz$AxRa<Us_hi%F5bf
zQtjj81EPTOOZ<Xw-{`;^;lXE0L&L-7wexui4C$PloG7x2N=iRUOSQDL!~!4v4(6&c
zkWITnkh;3MjP;qBOGh9ugCHYz$SP_&5rOaO%%q_W+N}1i74v|bN4&oBK3GuO?X^{)
zKRGvw{`P2?2os810ex=zb^TNIU<U+^b}5Q+>C?d)SK1rG=%|G;HVRTML9d6;crb14
z2VlZ{cB(X*9Zk&Lt3gcr5RW3q_XHFvTS4gK2(0iHs=1M4dT6KdU7$mxk5FlLyE$#D
zHWg{9PFO)d`})(8!0n{mvQnyj7o+b^pdhuY=+zt(zFueywef;rd0oPaeCw0nKPFT~
z0OuHa|0~&Z+;FAsZz{aR*Q|Y{?*$TUWCUeAj+TBIcMS~e3}*_&#l-~&2iIDTaxgQ)
zea^|rQCC-|xq|$1WmGel5tROWx;d1TnAllURI~}q13QhXKB%kb|NMz@*-%+|yw(#T
zg%;8CQUYwADboS7FJO8NSd@R|%n?Fd<j|`tsrB?DTGfdbSk*Etz4(Cu7ni#K-7VO7
z>n&g$`k?EM22D=#N=lLoKN6okdwNook<#oQb^Mu=^BXKlM1Ws%clU);2J5^txlI2C
zBM{wP&Bn+2cbWnD()Y`Q(ibY^*4C9+<jsH0Y8X`NT%CxW@2`>gGvDdy{V{&Eykxhu
zL>Qv#<<;~A6c3;H;{qS=HvtA%VYtliw!=hqa&*Mw>Z<(i9VabndF|CZl#^Vjn9q)i
z5ffxtvz3-;7$bu}4wC3)%Jua}vrme;A>hS~cn%F8%WN2~XGja!8UnxipiH3xI{YBO
z;Ma!rOSotir}39#713Hu5ONeytu5{{K$oq^GwEeJXSf7Q@y=0L>dJHNLtf{f?@oTN
zsW~0^Ld#?E_Zjod46r~Er~9Jth5?W9TfWpQBiQZ5ey~Mom~;dd95o8Ml9G~}@bcK$
z$>va+vDCYFcnb9m4Ib|9?qG^FH~<Je0{i30BTp!r;^Rj@<1uDsWkI1(;H=^YKDADz
zVX!AyDI_l|YrZ>C)cIS)x8D;^7^ps}EgeSc_@>oU_YaO5f&rkvyV^BL%1<Wd!N$gM
z)FtusP0Y*5Y366-3(C`xoaJYt^Fu>iyi=-1fmw-(OFy)Du8v{@&KT|<?xNGv!)WD^
z?dEZNEzbDug&(1Pf6{qo!IF5lKe(J=4(U0b+}!*yoQeCn)z|d&FKVsyV8s1l<Im94
z)IO?4G*xelC*|K9a@ffl4j-TSTI{(*repNg+FZv{_~H4><$3l4DxlXv0kL6u>XkTV
zM_+;Z1ft!4kKfK;w_30ozpFr_fB!{t1oXidEX$toXb$tz856pSDv`m+=-L#rN-&RX
zGSZoC?9F_`fFOp2ids@u#v>pQ`1B~I5FZcBR^LL!0`9$5M?vciPG2(f*M|y)ktb=;
zJd-&rHNk1PxVXf`#8~kzgkzF|#}JN6>GL!nYVs&4EZ5pXvx4Wy8`|}X3+Cod-tJ)V
zHC}afe^!^7nXz<A4jrA6A<5qCo9Et?Fc(*AHH=rHqZ$tGQC9XP35m+*&+>(Z$7W_5
zkamB!{<snVPnucw-`rKQGBLII-zA@5@z0Nszf)JgyoMS01*Dwtu<agEjEzaQc%Six
zXwsrS9xi&u#RV=m*SB~&K8(iyeCqO3Zlz=w^dkU^-~Us!Q2x^y_zIh^C25cmrieB~
zaN!@X0Ol*9VUj(h%$;rp_xL#;1Gpv8r%hngzRgEVS(%lOPfJCmLa|j@Ss4UdoO~Wq
z5|Vb`D~E+TYev<apXKF;V559sw5NrIMMuCx5IWzoLPTA-5hKJ7>+0z2ywSVQT+7{j
zdV0dlSxe|8ra6-DJ6hR!4(z#mUubM;NeM6CMfW<F|Lsm`DKnccw5WU;LF((sh-;;h
zs;IZiVV!l-+<^@bk7MAIbFJl_rsf7SbF7krAW$IpiFoGd>0U4<^=<!6e`OX~n3iVb
z?r#4deWMiia}*RXKFp|cK-i9$pi{o!0)w`B9y_29XL;SYEG5zd6RyC}p6`ng7=+@=
ziu&IPfyZqC2hIpF%2Ve*h0~{^<Vr<m`LCi~ZYG0jt$|-1KQ~51^r!P!%6R{dd5K3v
z^u3_q^mJ1hFbfPY2P-RVA|j$qE&%~m6%}NT2WTHUOUkVQ?C~0A`t<k^>{Y4X=m56b
zXSb0yV|2W3&24}1I$w~u(tsqZ*$}f4c3aSrVTJsr5+oyxPr_%!JY2D@DKA=l^^OK^
zWnN@8(@(WkeO21Zl!r95D@(W@6mCD-+tsC{d`7ZjHCftpILikHTt{;$XJ@o|_;a9Z
zYgJ}{3TiD?Rnl=2b16OVuj{2FIE>m`-8+MFz^;<0C>980h=tV_VA|Avi66Ql5d9BX
zAldM%zj<ezV{Pr2mA&ma_sa6#OP?fG$miV8-MvUT?UEL?{$Hqk_TOgk-!lvQuTsL3
z{$JJ^X>UyNg=o+6{DP@0AZ#pn!~nfiu;b^ejX%J^S&Vg%$LD<K`q~R*UMdIx;NL=r
z&A4ILstgu3HW`VDCZ?vYAfPQQEPx9L2QzGPTAL}?mwoBy?;lSghSqn7a`OxN*8W?i
zkOwo%k%PZ3{VRJ+d#xmhSh5*C0UiDRPRw9Nv*!)H^~-eBq_Bh&)1N32w{M8&YH4W(
z8$tm-!T|n&E%BO4O0N|aM}Au*{QC8!?O{p4afFoQvbyTznaRn<pB3P^S6Wd)Dl~C0
zn1UGi^c)ALYjd;D!z0+;J@)oC!^Q@!Kfdzn<h_QPTE<te=TdvvGHjxv+`Q+j8GMgr
zI-`$%(wTMUL-{GJS77dOdWQemeq8Kwabe-KU3;|zF%u*g9NNFf#{n`w*lym!`kz7%
z(DrEALu=H?6JEH;)5W?70A}rskM#WTAuBo>orbK+{^LiuH^7bN7w|B1u(O*QABUs&
z_3_CObcQ!$;^4^Pwbn2(F-h_QE)c*9Ma}cN#5?o)Gt_^3zN~{LzU}TSC-qwj0l%us
z+P6kx?AA5!&%Rv7VpynqZ{FOgzUYN8vlk5vDwdy+5b;8AB5X&0Qg<ZJ00IMiu6Cak
zMLH<Rz}qk{e5LHTIFb-!5|Wa>J_0eHlFSSru*i{;|L=Q6#RgADa(ty%uEf_09o|XP
zW0J?Sl@TE!F0!)B(S7PVIylz+5k1~Fr-iMpbqx)AwLiE(S~6x85&{$M7YcMP0>sM?
z){N3HLZB27`u8eP|DRWB8Vp^Am!N^r;ey#@e=72Uvrv>5l4t{42M1uLJT_L!LX_f#
zHqPf>P$r(AyCp#AAfP2=VPU~Q?wOh4r=ycGGBT2uF0A#GBU-imd=<;c*&k;2aY_S)
z<Z7N`!(J@!DsY>#vO>RZySLQ#IcgGtsA=~lRpkf-&u9W{cH!y{ydP~W9?$hBW;53P
z87>vt3Xiai0B=F95b8wGVO{PN%z7U|II_OXt4BI*x99jx*W{f6+1i2wpr!s{WB;^$
zMDT#w1HVWLAlP@`1Cf!WtZZ0Cg@dW-dN2ZtjIfy3$*SdOPAs{|C8)j%L<{9&n_ai#
zVq*S+&z_u|oYA}3*N61g<DGgyKmf`M3Ve1@nETzH@4hX&y*y-PXMX@OXS{Z&!1ssi
z_Q-OJXC)KDdoaR!aF9KYbj=L)m$)e3pLmP;W{Zw=Xt_%qjXSY?9*s<NailMdBCdG+
zC2Q;Q?9E%;;s=i4onS9cJ^a6$7HEAf{;8}9CtRlR{(eip(KsG<4_OdH3e`>t2;`;y
zM@I4A`&@Hl-Z#;!ff(u|uCgM71GsU_{~}i{NPByGV3H~=H@CxP`o|Yq`H<v<galBe
zGctacjVAnjDkCEUdO0f_8;6y4@@v;5ui2@oEQvsI4i1$!Z~o5C&O#vV-hUb$SHykJ
znL@jWxlBGW)#!m<9i*bc6xQ+XAvLTBR~S?_;k}ktz2A)s3QEKX<fTCnCDHMo@ZP`;
z4v1`wEeqLtVnm_Vi_+^YoJ-fXv(8m-?`JkIL+?B<AFn0;iweX?FOFE8E%<*c94}bG
zK}71TmA&gU;VLS^4}8M>ubwFKYE8@4-=C3|9c(4;dSz?#d21a+!2pzh!ZGQW(`D}l
zBdy%BLW{rnEz(f@(?dA}BjSDzPS7;%66bk6k!AF@R9E9Ld0Gm5_l2!P#aw%WX}KDG
z_ia}e>-A(S#LhkqFFL!8{vno@4z-LBh<xC89`QVekmKV|&6fw=gQN~1xll&zdIIxv
zUM-<4LW?QseDx1l#{q+7U%wIy2sDm3ux(tK^wrk<6lQ1&+UJzl6H=@GFAZ4o#(^&>
z`gOi$Z<XL1-<+U3I*6PKRGR}NJ1##&Wn&$D*oY&9bOWi6^t5c38)`7R(f;G(>Npc&
zyd#@3c0cG!WS<HYTle?(L5nEdBD4%3Cr>lKg11I(0Skld4mr2fo@Q!-@%<8wmz?sh
z>zDQUt;I7R*TU0|T^kX-*)t9%;&2T&@K+Q^$sETUBWmtP7kCQfhe=5&YUjzux^lG+
zJf-rn>#)_1J0{h=L<u8$f+w+}`?j0}4K~it3WE$SEr~Anw?Xg(9XTqPh9s8KQbEq@
zZ8t=ZBXXLu*uV!+kzed?(cfe=<S}Fs%u{=!WvmjMC{J5BBfFAfBsUQ>pxsz%HKhHs
zb?Jw-FCkqg89>Yyr_8)&7_}r2D2(qoP+2?_d9_UQWBKOIr%$pwS>GppFtu;nj#!SD
ze(?wj60>RQ=nM^j*3h+?XobLWJ*)*nl*UOWX1ArlV_(F%k}d12;vnMkG3Fs*E>=0?
zyQQUNuBdlnpNWv5AO<P_o8;@A@j_t5`^*&45lm&gKD(EP9*@S%%~pON30?j;kqEUv
z1e2;b`RZoy0XenPZ$x->U3`(l{o$~W!_Y&$TVp_8dvr@b@lJm_meH)O-oJNrym)d$
zF0B6XL;LO9+IhQ=R0!fcXAhw$oC9WD`uaSeWC@HgybJ9jAR;O$6Z!A+&6|<e89htC
z8I~WOpRT{|GF>a&N~nD4Lx1N@pWQEiFx|IKCSszb40@a2i}AX&45Mt7rnFHk3Mh|O
zkhfBW*vJT_11pVObe*m)mW{<?K+bHO$ZGXxlNULDdtO>jmTa+jOvOO1l`@rA&*)Do
z&By(nEF!$~Sb~6%)O#5a8bF`l6%rf_N3xtlDM)=-{L8B)<5Hab049r^IUSK>Ne~k4
zi^N>hR^0{VlJ@=K@AE4Dj|r(a``};Z)<({ynQzA(QJYQ&hlj^29h|JJnTd%s8JHt~
z|FUs%j{f<x3j%~7S%r7NDlUS5YWonYPaUV9!jqx>VP8ZdTEt1AmZos#ka}8Dj<NNI
zrDbtB(;cmE@wED|b2mnT;6_2-=FaGi?2tk&_9hk2{+V{c-vXqVD#V;uVWQ@PqQ<ik
zSmn~#3)^}MzL)ppBtA|%W5I72G9^2KFQxW}4?lhHQ(qKj1cKP_YV5?P&;q3CDAJ`r
zggjtM=H>*1u}3~eUpkb%=J`A2)rc`>rUO1%OZ1F5DEor2(e=~EkPy}i|IrCg41-N;
z!Q(sTKOM~^9fTO)Ca7ZXcPW+|f6J~ZJG)#~{gaD|E+1j&*6FnyAt_j!+`w}|Y7Wsi
z;Np77zF6X8LE}DFIi!ll>0tWR)a2C}1P4m{B!;)bo<~bH=0p7U3ub0!=LhpVG+(aW
zl|^YUbDF()>~cIyDn-1~6AiL*^kaGgBs58}LPyvu=RNnCL;IxiCF8v?F;Zg$xdyxk
z+%l#1Tr>_BdCKt|tFNyYi8vuCiHRp?u9K7RfDK4q$CIGJvf<NxQo@%nBCcCs!E&jq
z{qK~dcR_t7$SA6Ouf|Zm$E9%^kqhGkEQjGuRc&LVnD;4e<}iR}j9<NXaOg2GAOeQ?
zk&&IcwZGBr$<mUNY(H|X4}8%gBz%xG-9R+mRW~?JEXW3+(?4kCA6`t#1i>CqMG^dB
z6_;Ougnx|1KvOSzFN>53gpgf%<?+S<(RB09=*^7maq$)vCG4@iEG3Ld+CW48z?c2T
zTQoo{JD-hx&BEbR@w9wS>;<{Kt5Y>4@?>`*+STz&V`C!<)(sHrlM6Zp{V@aZi9G)P
zqpixa&&Ds8%rmF6reS3@rt_L40siccP<os%+ChDJeb28)se8j7pr<7m7BA?9B7%{n
zMzn>|xD`k{UVt%>ZzUy@`!nQ7;$&cU7ziSKuP#S3>N!MOz`nr?P>jC&E?ln5qx_#{
z{DVT|XuCgAP?V+F2gH|haapw=d&saNTI?79BqvYuSXQ*TZtp_-a-dL?qa)gb!)+?m
z+nsT(t*wVJ=vz>1?^hUD0@T}>dmagio`Yk?c6P+U!}ehmj3F@o-@Y3hqa+m#825R#
zuQ4=>+M9Xh79tKZdt)eOuP-h3u^kNs3SHdQv^+_uoIebY;d+i|J`0&aKm1b}$6up;
zKA%mDE60=$x7`VVJ4L4g)n9{gSBT@L-^uCni7qSh<^Bbm4f#n_)tTpv(XD)qJCyy}
z`q_;Rj})zx(H`kbL^6jg@0GUN-wKiS6(ysfBN_)~U2pH?wEJAM`&xTDxM3?2fdS!4
zMS6N8&wT8xN_@B8eELc}KIgUYH{HQ?3z3kpeR|4GNlDn}Xem1;=3ue$DK18|#(lqZ
zY3Y67Q!ElvQKMt-YruJaadSJ@)Jzr&@FU7KqCZIa_T5s=?x1IX*7L<nlha3kh7#yR
z(J()MYA)r6r&{os0ck!Q3{YwTkUkwV+MQY;hm%HeR9j?A>=Jk&W@7w1sqo>gW{2B5
z2x}Mlapi1?vOX`#^af-VVRY+s?n6cEr#Yx|1I!c8VOcc>4KqW?Vxw^ow3GY-0x0M*
zG+fNg*5J9W4Bk{%efxeVN??A|s_c96ereCOan|5E;>|7a-G9A_e7F&s8e$P*nut#F
zXDx(4^4G%wm>{rjH2rsETwqeDxY!URQ^AIsn)^h0!{+8&)NVJA9Nz~Y=<X!WyW%%y
zwg2QT4-fsrIfv5u8>}n~^mJrPjmJ>1yo-(f4vsyS#}YVSa~K-d{wT9uUst74zzk6k
z6a?Es#GOruLZ;C66VFdGjCAAy3@e9?rdBQ$TKpz^ugSRUr$uPzlQEEw{)>bEIz~(Y
zL!@e=vFA78M9Yc#C@@Doa7;#f2v1|y?ew~Fi3SW-!L4r0%n#ob{tJZQMgK7a@sfaG
z+^UXRI$S|X$wiRN$@|mIr8WFP4+tAJR#pQ;Pj#R3+GPO=AS){ikco^HQY6gx4R-0t
z$x~4TtnF5FWjd7rJJ;0G>R{TQueAgnXLr?vL{fKm_rii6Eo~tvvXBrFvjm-AzdO6r
znYg*RdHM2XO-)UBc(@t&FY=3a){49-K9e!_`*0R^xzl;PY+b#u9+S=svFyHatyE{S
zG1r`=u+)NsDZVe3CPx&DRmRPxLzb)@X(FJypPJHSXJ;Q%(br!DKS)$`BbgF)S<{=q
z4|z3-evg~}UO>fSHaW-U#u_kBzgkr~Pd`aBs<F#<GBWX!&HDL7m7(z2ho4|S;sSZa
zF@RE_FhPF)M`Dc2sZze08VyaG!mKPXYnN$jOX_nmu~5H%f40q7U#}Gt8%!s{u2*-m
z^b3!Ig8J7l7^BLE*r;Zgo6{LM1RsDxGBcHU`Er*1{0YRlpT_FwfrD6IpJE#4G4^j$
z1T8!$^r+%}_1!}Kaz@4vMMcj3I0LWs-yEi<g%%%2|NrmG&dIEdtBj2yY@P+7>RYiz
z%Td|zR}bquZDY~EWnK*FMLoHS%zX}`F^Ds&766C`8*l)G0u1}C01nX8-CeAb{iCw-
zsr+7(WxBJ|cqo;9viR-Y^xem^tzkfcJb^hvil?VZLNC10Y!OCU+V$1dF;Sj~9yrWQ
zA=fRU*Kw#@rCkThEwd{tPuu*C0OWuE{5hv#E9jQR-S?7nbB$~nbP!@!s!jWcz;C(I
z;b?AtG3l9c@gsD!7Hys^hb^7g97G0nLa}M_VMM~Z^|_;W)tMpwsc&5Eofoyrv}I*v
z&JP!_PS$!nJUmw03>zJmXYbz0fI)y%an{?5aYeD_(sGHN_7P6^I6-a06;z0dWs>1z
zA#yy)*cpllB#H~t+aI5+l2P|YPEJEn(POSkijYw5^3rvsRmjn5uD`n*A*>s~VQ#CP
z%+u5E9v)k1oGZYKhZ3i{s<qXuH}V<Po|vx<Fyx|?XovKeDoRP|nm}a{OQo@^PfoJ)
z^CM;|E{uM;G>o+&xqe1M4mWCY>YtmV5)*TU_NupdMsGRplYGykMEGB!{?UFuRYf=X
zeMnzHZtVU?wilG6_q`FgQPWZWQ4<?eZ>8wqkNVL+@q#Y>-`HXj?BvAC$|^EC8es^~
zaz{r;;5bnxit_V=`%GS9AUB0uaHj)awx%XzKF5kT<MU@lFE7D?&!~=H!QMbc1%;r^
z#KgqGK_zBpX7DuB*TTZWR_U7{@F4C1&_E{{iw^8{A-zvYN_wZQ{iwZoc`)yKf9*`w
z2tq*~rm(OuB0Su;*_j!rli+yvd;ZMqw6y+p7<k~oJL-8e51@I}eN<GiN=W7et*D*6
z9334KuzbMxfAK?GM+e0o@1mr&l?)#%1F->KjKF<cKeBlqblaGu{P(no$;cUWoSZMj
zjg38n_xASpr6|Tks+Q*FP;CHWpBGj9I%o;#Yjoegzdsk4GMR@Y?)f+_N{RC`2rE5h
z<k|N8VP(C{LjAhaD&U2@S4O%eA5U>{aUokc4(<xPughYCeKa9E{P98<I?;y@AKEK+
zmX>AWH}%QF(+w7DewH_u=2{Z?$jXu0uyDB5Q~azP&$!!IDRNgIhNYwsR2uptK9IbU
z%pKZPXX0yaw&^4wB`FCD+eAhtv9K^Klz&0VZx>Wh@H0Q3qsrN7&&tlu>p{3IKX|62
zBlthK%=73za0Ym9hcU&USzA*Q;D4*HpU!xDK0Nf74OGvQlao91@ASXU@c|a}e{_fc
zV2pni>|%{rcFl<*Ggh<`1$y7z(=!-Fjfn*Iaq`<U(B6LdS1U-i_Dr(8muy`Ij4cL(
zY7WNVv%TpuN%HdYasc6=*Zl%e2l$I*fiB7~CPr?Sl$r`@bfi>s_~KP*(Bg4_4g0Q|
z3&8rOz$bA9A;0SrNSm+s>mVq+C4&CN-n6oml$5F}r?_~IQc;dXAY^zLGx^!lHDIPX
z3xZWBMZK`67EKl<V?{g;oWSh1|DA`0#TH;+q}W&ABI=w09942iDyOqkax8`d7zOUn
zzNkZ-5&_h1Mh3}P_{N5r@hlaO+J^+de4r0hb#zEzyBI40@^P`z5k(*Te3+5uR#rOz
z8V%|}pl|m)&iVR&D7PE|1?#77EikB^n42Ts5#H$r6;H4+_&<;w058XMI0WP%Ey>n*
z8X6~F_xaKhojHD|8;aJ}yA=j4okzGSrdC#{x2LSVzQ?hQg>QXr2OFJ^ALKMLhM;e6
zNd`;h1)`or(DXDk5{ruH-+$#cN1MxyJRypb3Pm<IGYcBp1QjVMF|nU|##3q!imH);
zu<yk0pi+B#GY46BZ~M?+U8DHGUE+U;;4{0k_H3Lxp=TI6*F+CDH%?S@?<p#~dV5)4
zIo<zF=Lrj*^vaFy_RE3l^L8{^Q8<olbB}iiwg%2dJMb?HfXV9fh~WQ{!H=(7J$oe#
zu?a{;XMFD5(k|1Mi<+JohIqJdZDFZ6j5#6D2S<?g9}$KCm5Gmhec3omW#kF|D=CO0
zvf#=H|GP4*i9`H!Y1D6y)=S08H5d6`69Ub`GZ;WT@t0x96?|p`3GDg};QfgY1IiY#
z-<Bg;UlI}`V`31oFzB*mXyB?52hq5n1&RmUH>Ib)9N6010|lSZY7gLqUG}C*0rG@J
zCWsx{6=Y6z25|W{znjx&B2I7t5V=E<?ClRt`{Uq-TL9A$ZI6(OE(1Ill4#{(28M^-
zot&Bi9)wHC-#Y;74A3kgHURHOE^IqKK0Y<U9dxVhEFsraP9rQgqHhQ-$a|#t_--2m
zU#usK;g0)Es;OR(#3Dap_+XwB?{t-@7Y0269R?90Ar@$Mrgjbv?%jQZ0z>LSW<wD&
z>IlI6F*!BW>b#D8vZt8D0OyGW6|<XrWo~YscPqIZ+}h@IVFTU}NZZEp<$Cpd0P35V
zn8;%q3mE%mL=Aa@i|yjgYJOGOw4n$nsKUU{Zoot?AV6GD^39t!boBJHpN(@v$-B+C
zyTqSN6eQC6`EuIN-7StD#MbyjF+^&JR$kLVFywHf`)pReV2Kmh@jGq|JTqG13J6%Y
zV&8?^Xjyfd`}q@>w(zr2<W+O@l}^qtzQuqOy%)Y8G44hjb~wX>dLGXhM0QTJ7IdKO
zNWCIf1zN{(ClM<sRFJ#0yE|W1Nj=e;s;^*}(0c#RKZ14LNlc(91WSd4cDiqL(b`=y
zR;^VZR0xD1+U)<U06%b+bbK*d8IaP((ws`^uD3o7;h=?hMw1*A@ZjJg=es6;zvQ-E
z&wk(^{{SS~SXEY3bCUSBHa0#Ndq_VC6u93b8Sd@u0HzB;o)5?+>g(UWc>^3W;&=F&
zUri&S6JFokd^V|tbAJIUg%>YSih}QPaBw1DeYlKy$pZ*Fbi`3GKQc8n)!&bzDMg3d
zL6C%z9C*H?p0`tw_HcKFVvnE&tTgTCnw)8w-~eJJ7sZW1!aFcI>H2tov$eG)c_(`T
zFC)+>rP`(Ke|L$L5~AAOA5Y0?(5z`-ke!orw6&$J&l)8k`>YLx5L4-~Sqxln-8LV0
zZSB;Zf5E0F;zh6lFOlPzdEP?-+({lB_(2PD2~JN>CDWv?WVs;1vaN9Kq*!FJ?c%02
zlV6G?c%fpKurnC7h+7H)#gS1V8|!jco7ER^Ur>;c-2ct^Z==H+^xB`aq?qAC*9x15
zrLRKZ$o+UvE35HEDJa9X9APf$4m>5bi?jK8uijIptvSpPx@n^)?R^~Exs}UsysiNG
zK0XYJc>~{-^(Oaeg*u5x3g$o$J|1G!RV6}{=l3q6Ae*7N3GlJB+(wphS#UxvoV+h>
z5+{0%1PP>EVHVZz0ho^*XJcv#0%JjOF><m7T|}pHa3wsRho>i?5pwJ5+yK>RWAlUQ
z({+RWVi0m@haEquA2WO!KNlC*t5?qg0ncXCL9wBzs0h2bAP{RZeE%NQ+g;#|1E>q+
z6T)webNp)LRg)m)z_lQAqF|AsE+BJkg3UgGkgAUe@1OB=;M%FnI2gA|wgP(n373XN
zUP|gC*tpbLvAMCa1p=aR=QD<+&;9G0o158LSx3t)0uD=pV^U)-C(q&LJIz->cIj{!
z1)*HBHdr*;uz?BES&_!BzqPrE#QiBOjGlo3(&UT~-3M=g;>%4-i@=M#D+&uZ-Rp_G
z!J!l7cNpSkE%f-!Q|9<hg#APAH8-u>)<#u2o^TWTiploK!0PG^7nlB=-BCQ*t`)D6
zmxJ{pLwCvUQ{Y#!=ssS`D=cki!KD$1G&$mqudeQNmQZFpw&T#|9RxoWuZp<J@n*TF
zHEi<Mzw|jUSf|cx%kr~(E8G%H9@Qk6E@yl1&;WpAC)}fh->;iR{`cDp+u?>Te*rfy
zuTT<+mX<aGi+o{h?3t*?c!6wzZ%1cBu)<15AY~*y_Aw}ogExVsVX4&{&+0OTRVT>4
zJ@Cm;a|<M9cQ-e=7-G2I#gP##Vr~UL@GQWo277h*TtvOj52~l)5*8-Wgwxc1{ThA>
z3$f)_MP#ptkdQ9^G^n|}fX;yiMz{>aLPLYcY;oJgrHLs<j#(HTeR18;*7lDlygXWt
zMh(nU*VIHLg=a*084?;Qt3CrZZGbC-T=&!!iqct^*}FJBP2lAf4fO{_g!kDN__mS7
z`Uvc*%E}0k$1tyd&r!1v5W^v!e(oDL`Y+gH?|K~#X=s_p<8WWUkdL4gEL)L2+s${c
zNjyx)In=2#?H`ei4*9k8_X(roGJ4hJ7Y2r0fwiQUw6v@}J3*)v`Yg=MjIOWGJn&}}
z2V&goKkqcjE;h-J3GNJuk{bn|{2HN2UcMqXb8eh4x%{zx`0q$6ZVumg+6|-ojDPI!
z-M#sWaQr{)0&opr2ZG#LkTuc4KctLkf%txe*XSCrU%w7w1q2};6R(pgRYhkR$Rhvf
zAY)?5Rsl!I!PXY?JsIlR#r~|+b4kIja2g_VZZlLhXX7eKH~Cl+ByQU7KXt*w@MnlQ
zqV9W|#@|4M0%g$-pkZrkYgt)XM8(7q5D;Wl8=^tFjfsl#fkDF*T4Q5kB%?q@S6E1E
z#W;tpPzm_Tk}}XlgRg~9&|*8wvHE-U`2OZBSRF*-f8<W9D9FueX+&x;5QX}Cdw~{%
z^L^DScul{4{aQW#%4JGILK1WqD)tP`&C@fN%k=l`-4CEU8jyHj5A^H+FGoZ~3^H74
zc1J%*1<HjWEO;1&pwlW~%w#*Xf9lnv{e&0l0_E#p#B?A}n)?->fgnzpI`=@Jo?uXA
zy!FlgOmwl?`-92eHNoPX+wbuzZ*i55aotK#R^xnh;v1)>E3j)C?V6KWn-uKEXdQJP
zpUX8MVgLIXmB&d!-U@Gdu-9?XBcaa(u<DY6<pMzm$CNH7d8=1cC;6}Q)!^XPJ&r!G
z-M1BsWQFL(*<khQ%LouUP94z)xVHVSCDkWhkL@vGs&|i~E$-q1veJLz12bv8z9^Kw
zRrIs@Uc+Lg|BR6RuaEqU?dOMI8uIen#~n|ppipkGCmquxiRJe?G0v-(B<VVxt2Pau
zfxjzKW0LF$E=EQ{K`6w+#AIh-p~3ykV>yCogQ<jI3x*~MISnyVjoN&WV$K)pZ9y^N
z2)aVJCXNysQPHK|-cL$J@QOKnwy6UXvH%2uLi?hrzFQGi16P>J_to$NECVxz^6{YR
z4Ux&Klq@H}#}^e9m0S;g=IiSV_=W_p%LCm6aeycT2a}K=AiKq*QWACe-Ef!aoxf>H
zO(k?PO<?5XBk;n(J_hj1;^Ja7gYq3o!pj{ucXz%yM-X}Y`uc#z<c*>toCsnaNC+-V
zP0oPtENG^orIo6VctQDl9oBimsFDQ$2u$_l8WnuU&J>i<C7t7EEV~jT0S^a%%xJ1p
zNOPJlUU_Z9_Jn`rW-fQp*9sTGRtnsqvEVKToYEP*er>m-X3E6{%@+9s<avM0(nCU8
zYid|{YTLoa>G5%OW#v1dI6F8%#`_$CioSYwvkJ_Sx*F;ZzW?~~`1pX5r^?57;U?TO
zH8s|4BFwZmmY)JFw+N9G00U83Qj(HDCFYa7bx7{<cpv^5k33UW>OXLyk((tfTz#0@
z0115e4G(HQ;TEiTDR%O_Jo@0{gROfGfONS73gtB*jf4Lce{YdE7JdysW7$7Ge({q4
z)t*%V2LtwGE~O2g%+<~~?Rd6_%r#y0&z}dN0@a(xpgLiqUSWBf9bR8n!s>%utoTK$
z7RFaoHoI4(wKtyzZd<;ZCspnLDdY#2&%@X>mxGCpD<=PNH--jdX!Smu8MYtq3>K@c
ziE?+-{PaWAy%)8%P=0o9?k`|fC`AYt%4WUX-Ra%k!+;VemiQwH5s_liOgu3T>Id>z
zIUTL(qT-yI`t0nC+mD^k{JC}gAgw@Mn169Gb#fA9KIA7csu7N}Of4Xg3t(E$jR7CT
zU2(1n&?5rNC@}0905ryp<N=x8<WKYe&q=^ZNOXYCD;^Nue-i0|C@y<Crl0`GJik51
z3rJHi0I>sLIQ=FPityOh-05}z%vq;r1g1$6fsa7a1&RdF@MX{S^LXMC9g3^#DXD9u
zp1IWJteM#6Rm41cHobNlm{KF<V4ypWX@BG>3sRsN$y~+e?=ko}E$~Mq=J<2tYDnkI
zjH<5g8pv5+zU+G(3a5Q>UK`HL`a+8UZV4lO+b>^F)zKW`-@hqFMx<$Ix^u+a06jq~
z_V72BR1wTHUmY!9FsddpsC)qY{yfl#JjJ)2J+q~;MmVwXoJ4K=5rkSYhgD5wavWvS
zgJt5#R1g$GuGt$$<sl|0D`gki7EPdrUBzlrsOU)TFYrVs)O5Z0?2Fo#{}|)ET{>M$
z?U{u|J16zpMNh<2q|9sKw(-@j%jV`iYU*HA4LVQ3rP1Tupr9(9N)-UM2z)(q02SIO
zfUf&UHSU6;2B!DpD)_Qr60;o-f1yWuLOpcI${mkYst*JMSM23?7}7rH6#6hXvth#H
zC`Ktn5mUcReB^hkpXo2z3QPt7DtMY`i12EYT<J>S;iIq&Y`Ua!T)g7qBK(YBfP#;Y
z6iBRydzzn5fD)#JH1RMUpPXmxC;oJ`wW9=KsQLn;6tBaQ*QuF2P&vA~y6T#Fg`Gg9
zfni|2j!B{n8J4tcUrma&WAqHNkKP{1UMsGLoZ-jUr~QktQ5W<m>=3(^;v1KKb-Y{8
zcbe9LBbB|RLn9-HQ`RnN9QG~lJTK5&0PWDUJIP3cYU6l}OXzViQIyTkFa4>zmBXM|
zNGKLayyOfa^7e;tjSh#4wcAs7SEX23(mg$u7so4ov$K|aQ$pU}vP2wYsTmEY3Doc0
z+<3o14c-Sdjpp*`HBz|lj14L#1<9VUTo~>Nt`Hq_Py|<UyGXJBT|*~TkbTp`MVK8P
zu)GHkja5q+j^pRK(`RW8;zI+N3ORtqW2y0@+Bde)J{9<^f7ir#`-g?B<*Y6)N~0HA
zjUmiYdbuxPq9~Xpaf=jHRPCYpwT?W^)W65UTV-dTdoGdDj_Jnak;z0RrtoV;9DmYd
z_B;)W;Rd;vmKqzK3YOhMdm<cuzLzp88rrS;Er-G|fW|^V3TCc)wp!4HpIs$!x>bJt
zy@<XjGE#;ZL#CE1F*_;}5j8xNuv)8yFbVlG%VJXh?ahng<fpd*^bn@Z;p&=<f**r=
zwH6+Y4uPemd{WqtYdw}&WRb?)yM28{FDUL~*WZ6v!2%i<;E@MOV)P6uVPpsg*%|;b
z`iqU6M~kyHib)ooIK^*tv(j{&U{IV=>oN>7!6HSx!=4DwsJ<KXj-|4umP!QuwgCT(
zF5}c#2}deFqgI2^Q4~Y3R}()DV;-|@`$la8iH(<?k0(8h;s{ZndtWgqB+7oL%y1MK
zC@3lzdS3`AtSRa|uUX<hl<&TC>ZaVocw&{xQ0(0llZnPE3V4(=J@T;OH>VIa6cR2f
zRi8wTTafeeOuk4iew{6$KO555J7O$=1Xl3Dg;71XcdvgS)%M)CgB=dpuGh4#*J6vq
zoLbY}3kP+d;X6L(mSAT7#ftm6{f?F_{3FKizz%N(Umv8WMg}W+mtTGyI(bD!O4wR<
zGNJoO{mvwqD15+v1s5$0bMI~(WRqFSv9`NTJjOWIW;Rs8mo@0h6%szT_v_oP&vR?8
zKZ)bLyiIeaWsjB8J{-m%oh+&MPt-u#^5VHXmT34WJ+|>j8tCG?6d~VL``i7u!^3a&
z^q;0HmanT#QLgVecjufFK}<-bj}sS1sjuf7$wv3OzrHC^KjP<~AtWpZ;^djiTnX>f
z!SnN_#l{{pG053=CF=8Er=t=O)87~ga#Hf>=+T&%5k6jT4B8fuqyn;AngU?{FpoJL
zLJ|T>=+>)Wo+z?uRpCu{du?qi&J~i8e}-4FalRo(JO7<-w^f#wC90AGai^5~QPowV
z<|Z#}OfH+Nv0NLI3@9{WVKhSKRfX^3ImqpNJc&?0MRBO`1$|5=;}6`=_WXhIx6l*)
z2c46nzvJ>u@-tTTYGudvPrn1nR=Nfnv}kim;O?WIBD$Op`WBOWzK?_<+mX$kiG2Ca
z-)mX!pUA$E#I>&Xes}sE%mQ7e5)FwrnTD#*o{v~*YA?oQ^O$mVulYS&0CO6PA_bd-
z#5Pde0c^T6>eq*y=H}BpsTpE9&gd-a)JLutlvwfcfxle0k<}nI_CwOP$(7mx%o>9q
z6xbR+<z!|CoNbx%XHI@sWqdcukJ@cbe3NQsMf%dcuHh)~-s@|D@+qZm9~o=Bfj9jD
z*6&+S|1Fkg!IV6BD5i?g?{uED?(T{oC9jA$b7b=KbSlN&oU6Z&Yf^|si3C0#E!4aI
z{8_xaV^#GYgNMgdN5_|)ot$c7d_4Z?@y<?43N}^R0AQ<_7*QQHHD3D#!Chh(UOv9b
zp?%4Y0CGlE3Jzu2OPHyR*EOH<_3dpe>3dIsQv+CBq`$c9)vkuPghy?yEKk^QdCA8L
ziMKg~UWr$+US$X{L(xAAKT8YH|MQ+S(^bAF=9($1WIl!u3WR4)b>ifM2gbS5PAcrW
zw$OL>J?>@2PK_pMDH475vggi^GSM?vEjQgg(iV<>wj~#{mdv)r6N)GfWwABb<hJQH
zr&yEqa2^GwR_k%#$18~|sifd-m`waZhQ6kY!ow&t-ZACPbD%HJq*sP_aYYAS#3&d$
z6Rp4U(`N1L?a6lsYuNgqg=a3_E;drq(|wRifZxSUw;1B43!UDXcqp9M<Soq|NF+`x
z6Tg_Nc10(0I2`MWq5HG3F$$EMIU=4kA*lEduvucgx0!k#`}}#@pBvBe2l7bcbuqxq
z)*Qe44t%Dba3KiVS^fHigmE-t%I4m!-sYCfZ<*M3Cd@OUhdu#yzfv0dU|qVl2YjeR
z`}ukC!0^)VBG$-_5yLmlSRHG+0va3;&w_%-y@|u4vqmDy4xyK4$(-2b*{_^-cOX$5
zhQn}u`oJ47fo1Z{ZgBkr2eTY9Y1POg@$A3z)xi#qO!nh4kqMKNV#RYWC8L7$Sg@VD
zWpIt=H+-5c=QdINCOCFd^v&<jezRR;PV-=rJ-W4ETqM0e<cX-2P=n7{^UP-ZsPOtv
zv^K@U-n3KSoeFY`stj6maGHxO3{THvs#Fms_iZGP;-$AwSI97>gkh&M-|7od!_wT9
z>g~}az+j|u<`$RE|DAL-(41PIn;ZJz0F?28B0{l~vyQ1>V9v7@CsR{)`im5J40B*s
zYPBb#uJ|o!Sh(T%ga#m_!i$TGHD%Q(;%A#Lh(%i37aJ>)Vk`Xbu-Nnoug|T$CW~(|
z86I8B@s5-Kwzfm==08x!<-_CT7Y;_v3mPe>6z#ZZ0jJ=jqYAnks1Ec2Pp$g>*pKP2
zebJ0ULc<Ah9YLCKHa@qE2%#Sv9U#IPABf$KC|>tXC#Rp;H{S}+F*V!$OS;}cUMN0)
zojvOC0*qQ=OWicfa7my7Z3q4<<(ryHYez&kZZWuXZHnmbJ_MI)qrG>o4j=uLY#eYb
zW#{<Z>3@eShFS~G+;QR!>ygoXcL4qPLNGLTX_)EIpI<iiGm*2b>hJm;wMi(Z@u#qC
zch|GPo6~RM$ZF<avX>M8{#_zqwE!ANMlRT4q5MZy?Q412J_$cB^GSx4)zvda#gDSG
z^gy{qCfa1X(w_bO?O9ypoX1l@K}7!u68dk@V=c^kM>k$A(JOB$_=g9q|8ASF^{+5m
znQZ;_JR||2yn1>Mv@{JbNM}?RROhfu?4j(Jf-c@yn^yF~1iW9m`ZRji>@)1v`3t@m
zplef`L5J66(Zgt}n`|Zn=wPZ)s38#e?e5#+$RHg2tUxZ&+-vbHJzRJj*-Y*(k?X>7
zX<gayd%dN;(ar%tpwql+t($N<vleNkBY1Yozg9}uoH2%3S**EiIDnY0Q4&^BZ}(=v
z(3umVE1`c~&uylBaP<5_u@x@Y6;00Zxpz_y?Qpi@+H*vM;g}I`6)lsAfWU2RP@+;k
zi}JL`cJz7w-YnhAHt_bm)ZMnlb^9I#gE%oWb0WWIqpAwC(XpRW%*Pqp+XdXPMFW;*
zdwUN$gRun~5A802$gV&>-q7da7W|<4jTDLssOH>S@-gqn8p?Nn$L353t;LLu)wZ_>
zu5?g-tEj+=qQ*YCxUj!}SSBYs!2YCP>Zxg2Y5;vWf47z39(eW%G46r&IXuqCZ-s>Q
z0XfKJ!lRuMGRDSv(b3Tb%5XRF@-L2E`7+`zF^}eFN|HW5Za~Om-T4R%N=T5R8b#R%
zqkC>Hm=Yr+Bh?G#n47B-7Jx2EFWN{W?ylzzi^^y6<yPsGE!i0!81i*i^RJ^`Os!%w
zLw&di(i*1%f6nL?2<wJXo|ldY0j?w5OCCUyq?ELJ?bFeVk-stqBO!N-^*ltM0aKN`
zy1o6l@rTrS<ZZ@|m367OpBKJVRSN5apMU7#!L5s+pM3wv%gZE>!v#`&x5qz8*e7f8
z#6ar=>F&2I)oir}Kwi$Pk9|eTaDflgSY(_60`F8EPo~RSkTG_i`?eBvhlJupIP{h>
za}A~i?T0bqd&lh6`z$?^O*@(eFb$hwt5;uF7!8hYkB+cPirEY5!2(+iY&1Dp=D7~$
zZ**tf&l9n6+`rx47`(z@9bjJSV4=HQ1E#SbkLlrf84Nm!8eUBqa2L#GNolY4{-{PR
ziTZbd_!ypK6O#>%1o2PVoqfdNsmU%`epjITwctGfvQMBa(L)DjK#C)O$y*M0?l^B=
zhwMx*2>N}N&faM&VFfqQBY*?44=OikZJ!^F&+DNIwbn98q&~0=Xkyub0ip+F*25X|
z6^3Gq)9!MbQ=!lCQu})mQBZO_0>t=*g@5|{wzT_;fI%HBOtJOVs=dJ!13)mE)q0bV
zu(WvQ-rpS$Twi0Og>m~e*_LVlUTp9lgqJ$=TJkt#@YtQOczPtzWC2Ff`ALk9zkeeu
z%QpxlP`B4Bw61GR$X!!#Ku@d$!%d0DXq>8_Nk4;@e;;YyUN<W#uh`*!gnuG`tZ=CC
z6tbj~9u7=w4yD0#qDcisAKhn|n#LO%lU*j)lL|#Hq@saGlmwU3Q?*^mXMeP7Qi@s<
z1^F?ff8pzB{H=H1>0wX#q;Fb7Os%gfn;loo<&2;7iS$7Huglay7gN-93d@dXr?r=l
zQh4Z1w}`rWo6l81QqBVn-+ph8<u*l|TES#|%lr$gl@H_QKS5<Zgiw*uHk7^fmNeSW
z5%><T$s&dAwJ~<*ZzVW_-qIGZIP9x(9YWB!(Nfa9Er}go4?j;=*|ZYFe_`|HmzSf`
z0OV`JyT6Pk{k+Eo=zT4xnrf{8)aCKNz5V-_6{vG0g<rp>Szfw-ye+!{DW35yNwfRD
zk3i#Q8iyEQw)sUxt)RUGA3u`*jwKZgXx)CeQ%p^@FV(c$9-+4!g}}~4fJ*))O9~i)
z46n5;#Kn!<-P==`&hWYOIKPGs5n~8jCA<8(M?lezCE;u8a`U9ES88pPhX|41g#@n`
ziAuR>bI3_`)teN2Fa7&%lQ8X87^C`dNpIZz9*G;)o=U{95gp)GWGX0EwC<gbv>NEn
z&PNe}qNyuukWQtLNQLN!lO&4*8PPDWKAV3XtX5VnXU&B|j1VfzImkg6g~~35No^El
z%yeQcZp5^K&UZ1jb$&WErlIdTM}VW;Q^-@Ijx+w*Gv{48FYr6%J0I~%!+Dn$pOpN=
zVM?+CNu`fR++t)0`WWclx_uzzdZOp{oiN;4pL8(h8588Rvt%oQ1z7mb0Qk72b##!l
z?lPhH$Ep3XE}NW|$%s?{5KP)^Z$ls2BXc#niLe!}<k;Lv^h$A}!N{KGn^qkD%%m@0
zOl@o;6@_n^YNmR@=;8R3ruA$Ek{YbFwY8u?123tVj#){;Yq_b~BZ=YGFJL#C=y+lh
znA-60^YepYjDcZxZa%(enYHr=prn2OzV4ewsd;}NIvN@--U?vEN3s`pc6Ru_eK1it
z#BtPklX^<W7>XN8`0{MRdD|HoQ;?Q#il8Z&T9xXxda~^IL38uU*n5rEVD~iq)B}S6
zLB_&o1FkxmBhut|6&X$L3I509uE@vwFEx^lxLso(!qXp-*)z<(0n%5f|9AC@%<LDM
z;GJB|HBKlbO0oJ&da!<H*6`WohQ-#T6>-$MG}-MfJ7o0Z83P;1TGLb<9bE+S(Oc6o
zVkKTMnzcgjfByW+?A0!vuE1XI%7#2xR;M=MzX|EqI5Nofw#T@@H~5?J48$uyZ)zcf
zqM}IrbEDZgDbkluS`Te?ModUuOask$0&mZG8Ly70XrAs>k4z9Nv#_T>{tNCBM8`F5
z<B8Q|bVV*gU?NEQ8T9a|{VSEm&?)f~s^+au&U~~+10wTcl^>Gst;1`Fd^@3=3l%eX
z)c}9BpLRmy#&TcN3e9b7EUf3{*P!g9Mc1>Y4HQT-<uc!M^!bYlt+PMNJSfA8d92!9
z5hU7FUIaEU`d$6%m<=cQs@ineC=_FTOs|~@IF2eT3nA27baUQYFLhk9l5Xn%V?YG9
z2IJzdUkQtQN9=d4b6#y)60wn2Lyk`uPJ#v))kkM3oTW>rQn%zdq_xXRYFb|}$<h?u
z9~5(!RO{qik33&;?pmB$Qs~6f-aDK!2%v-b1zRNs&P{)6lo~N*baSlSF_}RGCB=XG
zM7361c2FvG9k$3swz9~S%pD&+@Bd=LpC(r{QA#ZJs7s9;3#4Rq+7Y0ur0Kvt{(;b^
z|Jrz!gD5S`pt&wZ;k7jJjh=q~r1kjtDA1yi;o=tT)ZF)nHR-H3kSBfi{Pexh+*KNc
zqk{Vnt?C%;Bb?4J>wavbmIdSaI=Vt;mzD<Xw?)xt4DCA(VZ(IaG2>`0vDT`DDx}-&
zzhe?NI(hB2xZQc6^XKRVT+ESsQ2TC?+;#YOj)Voz3jEC6(@EB8T_4o*kYaw9{+wTZ
zQrnQU2A}M;haj5L8lmg5e8nMm-uj2!KdlqtJi*Wi=<p^wxo1w(>4y7QxAS@nU!;)Z
z|2>NtoMVd36UjdDImw&TS4@!l$V0_>WeT$b!P~hC@BU^^h%EeuH_q!j`%t8}*cd5q
zE*bg)KSU@Zs&^+eke~t+5)Cl2_rwbMf${UJvgk{rLn!;7W?rYf6wTI-7n}Q0(3V_M
zED|N+!R;q)i5%WY9lk&xcO>LckDF6zl!!m_@vmVO$N|g{$&}@AZ}hNF<{g*I5$;D8
zri!>Sr?v5q(J#IQ4(tillr6@AJra)9v1x}>kLyI3sw&s73d<H72By=b9hN&zN*1d)
zvhOmks`rcY%NlH;TLpfXanXi-H+HEcL+&MM+O!w$@@a${_<yka&A3J=T5Bx(zU0jv
zHs9)P!i|!`OW+*;5$UNE_r_~=S`z#>5z~PTY?B!03?9CI?MIllI+pv{^I!w5K<sM?
zQSfc;P;;iU&^(LNviUi~4+~KLG#1#>-@-tx@aZ#kp!L-0ZqPI`h=E|#7&m3)$X7V*
zcPFCgc%dRYyE_Y!%+Yo-Cg0Qjdj8gF;b7lHha&+wEA!3yrm|r)VRn&gU)i%cc0>Mw
zRtG=M>DW{NV=eGcwYEnI2#%L%E;+)2)8Z4;5J!4St!$LEGe?GXVwSCVic~W>%7a9G
zkFr;6lj~idQp$xh1Rc|>Cuw)aFM6fN7fmXAQG6&oR=EBTTW1+oWfW~|X{15Al$1uK
zq#NljNd@T=sZC2aNViCLcXyYR^rk_&Ti`A{=bpIF^@l%<NB92Lx7M0-j`5Brmpc>I
zmu-7&ixajd4?g>A?L~W`sI!BGGUP^F8@b!8%SVgd+1T%@nS^hr6he6~I1L)z1QkJ-
zboE(J6L1`!f)1Ekl0&WVtAgnsIb6%`%-{d&$?uUpopn9@xFRJEI}9AnD8O}ap922u
zUs|Q+D301=h>SN*z*q5XBUwiQ8g0(QznUG>$J=8Dd_2}F8X7n#&q?2hg`IO-YyZs1
zQ08jU&s{5yW2}QVya4WMdU$#U9Tr~|u7sw)1@&_4n|JqSeGwAS#tX=mjr7Me#2si2
zO?@)|g+_Bu$8^2b;i~-W7e{SM%If5trsW~Ig9S?ji+{KO5L%?qI|}YVL-7LfVc-a!
zKgV;#SLh;Je4pZM%ea9`b*UG^OX3^UBM{)6)M_F6eJnV0UzDKjLn;o77Bj`}jzZ*0
z2Zl#4ALi@lIHZ3jQ>w-E`}PyUY%Hf`uHCXkcR`B9koOuh=708K{cl*0Ri<f)X6#QM
z+Ist6eldEYzhtj<Of$HJ&k&tFWc`(C9uEsE?(YXCt}#(D*1KC44vwj+nu88Qa|&cI
zst^_vOA=K^#IJJL3CPa}-Ef0vYa`odZ_xVelr-7IGbem}K*FepdC*t6WOBKh`gf`y
zlDsp+Di7^I)2YpT@)IIrIJlp%B@LIQHG6|6$Qu=WRk9W%+JSin$s#Pw?%aqweZvqE
zL{BX}k>v1ky*-{t291GiY>Y*l15&f3o02hkmxb*AN(x3%N`zhx-}e=A)VH7P`Kc=8
zs=+Y$uzT?b@x4El%?YtQGlc1ijJ~-s(z8R_Id3vi+I(Gos3WiLh6~6Do(w|&^}U2E
zNA{4Y{k@(cGtHL5;cRi$g9oo0H@LgyRooQhP-07Ko6#32?V0md2!RHFus6f|i{Ye6
z7M2BQTV(e5R7bM&VRwDQLHnB!;aBJ9sa)n0;)-LA;1_$kbpia&bzf^NlmIR)EcyOy
ztN;#)(u_k+UHxWbK>p#OkJZ5QqYd;eE$z<P_WopEox_n0{fpD{^WmMz7ND?MUl$`p
z1AC!=BO?^%0VY(BAdvbH_ql{n+b??1LcV`VCxd<JCs+mR8otl#AS0wtZ=ma4^6C*O
z!pJ~y8xnT<GzR(CcHss0AmS`&2sgqiE`gVf3o7}yc}!&F5JL-M?eG*C3s1X?8W)me
zh^>&bbFkO7<aWi-J~*y1!Eqh_e2yf2Blb9$uxU#poPqm_D5TCWWb-69&|jjl;ik>q
zMMX?3W{j1SgOtm$@ZfieK9H>wbC0f{ZHMwX`AJF$G&mp4{>rEVK`56;%g122$5>?k
z{COV}6UZ=Mw4PzX<VxYTPS!O06L7MQq@~RZc)~5_lV9OK$zpcQ*0DM))KZP52a26&
zI7mNa9fs5~S^Rt|NHL!_$4wadv9=keR5p9cQ-Kr2E#`jKY@Kl6NGw1H6%pHR5oGdD
zf4Kej5~*ZA>y)M8JG)WQJC{&9;+n?AChwrDHW-I*J-YQry9A9r#+?(&Eyk=IvEmk9
zVd<e)xAja%*H$W0;*pkf>X;$Un+2dAZHAm=YL@YE+aA`G|7qXca{RMWZofToeXU8t
zjmT}0ZERu!VBzHOA~R*ncvxTfSy(=wZFPa*Dj`<~YU+(OZ#ZnlA5z_4&&}$7m&m(&
z2n=?o06%P08sEP-j#7Nkn%1Gz#HL(xE_Fc3r(+<pdlqMbC++Eo;mhYoeJF%jNaptn
znN5W2`#e%tMe}ecvqIAoPc}Yq^rJ<=8+6B-j@QQrML$>JOBcGikEkdqCBunx{&b2G
z1Dum(m@5<1xlG@goR4to(d2+xcp>u@-)r^*c8Ixc_vR1ZfG<4LRAK1i%_1-MwyI(`
zhMW1uKv1O=>GD#hEuU(PWb*yvTLf!niG)&{Gy4-LR(j2)bnSP=-Z0Bg2t7?GY9h@s
zG0U~KwqAFao{UB0R!(4zWVb!)->-<Md^v6aGExxLvbH<j4n~dF*VkqGEzWD*B6>{^
zoJPG;czC|;?JXiA+iUA$-;%-zS1mq`TP!uwAX5%G{sR>`Ql_kvsk>@ZV?_zY>CUxC
z>t1RJGW@)D+xu(y91GDqNM}%WQiWzP40ci8j+paIOvA|)BV2*~?W8FkW0a_}SpK(s
z4*mm=03$bof6FI>jSx}0a3q-nAA5P=vvh`%luvW|IS%lnae)o0H~(l&pufNRX32Sg
z1KuU|+@;jIY`9u+L98<{jA)GBe@&0)p#hIuV8yTUt*=PW4J>vF>nykPIVkNzSTxg+
zLax$td_W-8SCetdHs>Qs4$hkPWW6SZnQ`TX+L4RvlcJ7}Z6Wt}ZB}!mp!x$&f%cA$
zhnut5{Cv29f%LnJlse;~OuO~l<+k+M2K&7#)1&<b=i8mhHz)`sg6>ht#EcvcSq_gY
z^;2Yg4|5e%=OZcXZ|aku8u5;L&0aXux`+W;(HIsVheTnoc@(;kvFT}h=vWSF@T1;E
zgknegWa93y7FXgf(n)qGp;^)B_$af5Fd+#be4-nNTg1p91d~b)JN;lQy4Yxx_2Jf`
z4#C&4(@Fdh!ehd2Jt7H`l8uy~?jvguvyv0Q?UZ~#>1-O+bU5`mD`V^LUr|X-zIRik
zDkR_5^h73P5nhJ+<A&6?#N&j9XnexF&~J7;C%z*S2p|9Rd>qrc^uo;QBkq^A<|t`<
zWlkD?^LJ}{-K!C^kavfA#GxXrn<spAX2d%?`J;t0myjS(fjlzdy|CQ{nU-}n=sce8
zS%rlt*MBA^-U<jfdtQGZ{XF5}kycXNNXF~1vD%gW^VQ=_*$dziX?4G;1_YNM{4O@z
zqhBxrXPTVfe_R0tH_(Q-ekmu28$`hr@>&rV*Gh~mEsX#s)b&1A;BsA`t++r#gv)(V
zL^EZ3k~?u63%uU~kb-VQhkeV^;@(Ct!CIr2_9MN?N9i8@cPbyDE%FmB+wyY6Eg`bl
zC;obY4l0u>5=zwE0zwa$su{u`ZI*iL^(ON$4T7aBv&+@`@oEnzzTRjVq=t+vtjxB$
z=>!j}R-DJOk~;}H*$1H6?e{TnmZWc$m1fnROubc*z8~8Gz^bR4Q2UC-_orPRrIMN6
zbjMeYRLpQc*?FQsH%nLY9{<RHJs_xuS~Lm@n_bmNP<oQK*N{3^R(oxM{=OMz_3B01
zWqRDD&#NXA%=E>km`qu<m*C1|T962VGehqDKH3*QjsDK(O-{-;e^T?$UjrN{w7ePa
z*JWouOR;*C!(Q7!z#w9W*KlX+BLa3l%^-$If?Ev?Ktb@{<Ul5qs!hksxJH(iDvk^a
z28iTn0^C;kopo$g?X!&C(v||srY*8p8$Es(lmAHOwfOkyJnnjHtCyrRq2f%IzxSFj
zkt6q`mbQ$^N}OX6X>Dt;qWMA&jcO8`mBnKT>g5+1m&pW0QXW$*3d%UO#5>KGEnch2
zvu6USI2dBVcFar+{oD1AIgW+e4AjbFNY271EJ*)Zzi!PgbfT4FRpW-Kn;XebS*!#p
zH8b{KX7AHrzGWlz_&n2`<)nT~zSjVWB$LZ{m21sJhihkJ;_9g%A;Lb%$>-a2zC(|r
zrdU7{3$cgzMCn8<Le47<@_xUL=QX3(+^i$-e*|{!)fb;l3b5YOi>eft$>n6I6SbUC
zeS<VLFw+h=gR*YfdqPRCQN@giBt_@L1=v%VLAgdx=Z_mD`~Hr@dK@DAXZK*}a+&w$
z<5dZ|fuJ!fN!56L$*04FzcqnN7s<QzXxhfslXEUZVee;fbP>xi>MI2=B}LD3>U}Zm
z1nRMx?01%%Y;Ab=Nghrd=ObkG-CLzpHJ6P0Ega>iYx)*LztV_}s84-hd~MCb%!J<D
zn(?MCQKl;<-BW9?-oqpDebw&(N($wiD@{}IA_KPsNK6nblnYcH2>!5AjeW(b*2>vg
zjTiyoQ9jo+oApA&*=w=0<0E{+C-GVC`3<egRYCRQ3<`>Ou{B>Wv-2Xe+|a5o_?%$7
zlBOqimVS1VeAyp;@hcuVS;)rqarEEtEMcvMg+k|9x0}^H-cia94P0joHe$e+V68q4
zJ%^$*g0wU~XTaq8lkU|lL8(l3!@v95!lIanz4?6@q5b|Wn?%B<5IFsf2$NIP>mkP$
z>%Rkf55jPc+6ddeKprault?d}X7TZ`58S?fz|<-FTP!;B3EGu=4p)}z)t@}|d*G_O
z618Jya@o|8Y=YwO_vDYRCYHrF9y|3HH2(Vs1kJK2vrOvodrGX1)^RnUqT7~*MXH>N
zt>ry%NjnzYkQV-=&z2_b<}NTkcF`Ws&X_{TP1{4QT($(<{eT4WA1wfZv0VPq&1l2V
z<uH5A;dcK<#R#CQNHqw8MWRmpyRXz3GDk!;8p^I0Y_sf!xG3sg(y>iT(?R22=jA)7
zc3aP^3ZPe87#_=tt!%HgY}x(3eg6SDz24ULcT4NmNo$Jo{GC+@103sC@Q*OdagjV;
zDx`g&kz4Kb5Ah8B?q%oT!2fk4C{y<2<%N2)a%|`3v4CW8)?4#QO@%i$mf3})u$ExX
z$+P9^6Vj++@wK<sULVc4H&dhtk?%aI((3P06{f&}`_<xQ3$ZH!|Bp61?S)bk;&}@+
ze53nwGKrggGJ=+nvpFWt_@WVFM>5VMDjOP!99%z9qhE%ly>|`!XT1B+-a$0%(3z20
z<>MU4)d6|~kxK&+f~3jq=X-e;#Q(1Jh`?PKgh~o!>NcY%!<=}E<gn|%ZuuZM^l)(I
zr!N*@%l8$hYp1-gNhV0P-W@-UCEOde01<S>DQr*0p!oQx11qhFC7gfac*k}VAw_H_
zlr|Tkb!}eXn(hpJq=c!>RohY&@e&lKSptQz9v(}iGk~T+7bCDND~keOv*Zp8N3ry7
zwSr#xG@c%)5#WZHRGBO_Ivs$~cYfy3>C-oey+e9bl;QlKPw>0KnGeq~zOmMoZ!dnR
z&qkY|wpu-J>&(@)_F$sZV}?J$yXXxX$=e2lZf{0f>l7BDKYt%?@;EOSl55N@=6`?h
z!yEp<?Gid^vohwE#f48$4mLNwMT}G-2al-b;*J<sxGY3du8=Hm*@srJ;aLvh_qg=T
znY;YBOKrl~sdnCs>c7{-=(U1_H86PXCidUE-c#2OA>=CZd(c&!<>)#hoFW^WC6#3N
z&0K@T4bMsr*?X$Ha8im5O|Y*eLwpg*>r?t4G{KDfxe$RZxyV=lXpI`;a32+FSgjKQ
zaPd^a*FCtw+=3Vt6qtiKfs(p0%>Bw&1XtJO$y{{Pw{aU9h?udkuoykv)c*bl65rQs
zRbHuXO!yAy-*Tb8eb|}Y?g|`(Q6}P_1;$ksV&@VYcadp7r^(zj&<e-ghG?SV1zTV&
z-bw7IT<4pD)8pdT=nB*Q@!J~vwo3<EDrTN_`OYTtT8+}KX1N8T9U_Z8q8%;F!(kQU
zOuE|}>)Z8|;GCXRg?pxX5HINPh!;6@FuTW^Aq>Bp5$*0YHD^Y}?s0Z#-7}G4RN2%S
z(=@i+-J(9OR6{xoZO>*C`To)%$HaZOcHcBYwSrfLM4gxpx(T+nCw$xB(*zJm`mlwY
z0sgNZ5?%6F6m?JED0h_HMkoo3a(Wuk%xvvm4<yUqZjF%8#_$++2LUb)kVQB?)fh+5
zdUQI=GLzSBq1nX-UdLzK#fgcgM!n~txvbSVB_byFK6zT}Tjsm(_CMbo9ttWeN3O54
zQb@r-yeTD!w^MOZ5Y!5z)v}cfyeFljsic1}xiIXJ_f1T)yPYS_W%^Uh?m}C#-L5d4
zc0aQ)t8~Fga+WDDB=)7q<jE+wg;4({u0Q(p4sqiHlZSB5I*oshO;>ZEBFUTHxp>XN
z*Wak6?{d85G6^LrH7f{<F@N~Pknw*%c+sQ85D_XR>UqlbuEN}bK?G}M7>OW=DVO*i
zN<fxqYk7p4ggVUs*yf%(oxn&ATUO1%eh@>5oh2+x7mfHWApS>V{AUYi$4wkDTm~pV
z`4NzAQATAmXXoS?n_V|COL_|*18Eyq$`WuFfdPnSv(5q(O3uLpZg6YBuyAmoH~CVW
z3>@c%JtR?sRg#}K3J_4afISjhVp=#Qsf+&P5}F<x3q+Z9p7(!y!{r48X23wYr&phW
zSzfP}slCgnFyqr%<jWXtTMPxov5J}lPSeqDpbg5)&feYQ=5;;KtgXH64IiDLh5}dJ
z`FXKy!XaRP5#eS@gnl~Pj<j=3qjkIpfWtl76{m#C>b}HI5cm6|sd;;qbqFVhD8pY@
z*VmsU);mTvil;6+dVol<`m*n2D)zi$WE*s(KqJCT=-@Q5>f^4K^3E0D>71g3R^Rb}
z-*N}zKJB63-ByuzM92}n;pXE7GU3ype#6D>D2OJ{JC%H<q*mGJTg0H$4BT55*487d
zU)qhx$jA%~mWkL+eyWdjt7&UD8uo;MvyxIJTnI!NfqILfm8bkrA&qzBhX6Q=jJ?-N
zzMLTb>H<oE#lHrv`~?{>r6z{$&t89FzPTO-NQ=vxvn&1PcQuARrA^N3hs$mC?l;LP
z+(ubhG_6Z*ZIk<RS$9`;q+C|Ykn7&piqQIo8y8oFgXtpQtgM-6nwSJuXGDU8bF6%h
zVD8h^E;rRlYyy^jQr>Fr{S%FMj+)JI&WMdnU&q;h(FQRfvcxP_JzbE~2QnFp7-!b6
zO})Sx$BtQRW<_|?(hiAb{Dj)m4wDM_b={9DY7++F#l~tWIKhRNj6XT9o`ykR{}^}N
z`Y|EO@CeN2r_pAixaIe)F)wUu{)OSzC<Z6Y0w5O_2s@{xrFBdau<8Rs)9uX-0uquh
z9zab#xjL=ffz1uboD!@Ne%sDfZUWfKqpdAyd8KohgLvB^fPHcX9!J2q0#g2On?SdH
ze07DS54=sJ#}>dR3G9;Xstj=`Hu!)++Su3_)J-d>rK96(J=AjtMD04YR@lh!AG>bO
zwkLPalOMkAB7;{X1!CFe_1J2?Z~~%>51}o#c6JtbmoY%W4M3!~SIeeIn6(O05Ra2J
z1?vM^Z*LJsjeQn+7x%5<O#r8awt3O<-hL1k)>4_U!!@3rW9DFIht`k35ScZ89~Guu
zS7m>5N^=Emd4*<aHmM4_JFhGGT|D+1vEh|31Wk56zG0J&7B_Y}v4)3hNceo#LZVQ|
zjgWIJa}!zm`rk^81X&2c)yb+E8WcnbVmvcZQ493I{)ZY=Wa3#P-<(cBIb{L*Omv)B
zwxn#DsgS$t>t`K^`sfx%fN=#Dj@L*sCm?|0q4>{~86cGb@iJ%#Esu;G0MWN_954=}
z!jlxAdAWf_hxW#rkB`sU`O0RgaR5KS-+yjlK}TCVIXQW2V`CrMlG0-chz7@yX5h2_
z{v8WoaOX<D6@Duj>>;TZZm(#E3a4P@-(4Ooc)Ky)HWlW8|EBd?$r<o$2zqQmn;Xi2
zFvs1q)!*MkxP79&J^=_Q%qM%rU#$p1u;jAn!E4gh_uB0Ik01xJ<L=Zd7grY?+!Z`h
zdT68&%A!#>vZno6yr7`$P<P7UW~H8PZEs&&vz<{}ZZ3w8+E73AsS?{2e>=6f#HsC|
z&t=-w8^M_LG9tq-NSJN^F@o_qdyrAH3OP&Kyx#l5^ksjUI5f`mUxUw|_?9kobb?;P
zKkeY$zTPZ9-wMsp`eeQXKQ>a{M{){w(LFXWzzy}6`pPC+vZiI;BTR`5C#GFU$(gRV
z*Aq&x!lIa?LIR#na&n{15-$=GlBK03J3G5lj4vJ_Dx;yH0Z^fKjRnH6%K?a7uc&Bm
z1P?>VX@Az%xgIb~!b!OC2?)UZ0cN0j2Mcu|fPJ99A2=Y00zLs3I}oXJ5TjSKe1ihs
zHnhnZbCDHnB?wIGkKGG%b3|IabmV4<OAYp>Q~AmtIm=&2c<I$dfpe}+?>+9$uyjSD
zN>TwBfYCgS!O8gc4i6u9r|E&E10!#?-z+h7sC&974OmdvjYFJORQ~)B$jhc09{Zq;
z@wmqt^YdqTo}vLM-|rXKE^VIiL_~Ai+7?KdlBB%2Hygp?&DPcgHCE=;MtvyoFeNNZ
ze0+VNBnl^qEY*E(1zmimD>XPceZ#c}*zrX%D2SRd8@HX*s-?(L)V0CgNi=o})c4iT
zqF*WF#{FiB&J)fX;u8CO!Kt;ntZ96gDP<VEgxuEm$^QLV&1$SWr;ECD5|KUU!|_&(
ztn<hWxB2-pbeHo@J>Zc&35JG#7@wu)Mo6?0=%RO2box++3&yOF0>O}jPzp}MSsBED
z5FqIEmEF2Ysu8GFb9Sv9j+4hq5!~y6!%04c3*ms$TlfpecEt~!1~3QUu->1mLMr|R
z5SZoSnws1I9#DN||Me^IGEq#L{QP~|6djp`kAqVQ5Huzu$r6&15kTUmr*{u*0-Kfr
zfq}r20=7M%AA<kW0^rABp@<9tGQr5;pan3jRf_CCfqTmoe()y^ZJyq^e^~??&`z|^
z%bVM49?Uh@77Qs=IILB_*i>>l`Jr*Spr=Ld2^}yu`g;0-nOXjLe5nynz%o8E8iz`#
z?MqJx(dg*Pu)g!nY1eB-8@)D9r(Zt-k6M2+H(VR^P{*?f(9zE2<jyh4bscZd<-H!u
zv$NkYsGDExrgd~gMUwk-m}eqgf{_7qa|mvFTAhPa9VmFO_CIhL57H6p%CCa<OS}VL
zj1-tp^2R`CwG_JvO!Bt(h|E-#{D!vdWL=V7cfrVw(|S7foS(wLY*sXVY~43GS*B+~
z$Q{P*w=I+~=O;e~<~3hJ7v2|m*OBwoI~L8~O8Tv1Njarzp3<uv&9CyljJI=c3Czzn
zMNV-3DyZF{r~%xie>Xo#&Oo&!7E`Fe17%X)o1X={jyiPEi;#IL==Z@>;FtsqBI~(I
z<F)Q!pb7+A-V)&Ogolp+?|%fjz~K7H{{H?qY9MxV0lp8=oqG0`ADkTEE&cfi>~N<|
zuuyVbTzK*)o*Er4201?=zvuPwYN2`&a8|>sk?w=wY{X_)-~<KyWMF%WlgWu7V$YRL
zU;)4p7{f>48navP_3rYP2gcs!mKKpul$4b10hCcv>gssfSCQ8ZkTUo49&Rqw)zCwE
zo29HMm7Ggm)9yPB;-<$PXIw<utq}6Lln=WahaL~9A;H|8+o-50As6Qy*##3F10h?R
zc%O|oEzI&cY>MH2RL)re@pl$nDe39`9S1tPIy$<1G%g@G((pC4PgMuk)^bJmeX5eJ
zsp;U&7j6gXcA*s&Ftj_i;<B=HG&H)=)X1=ZseowvW2Ce7Px4T1g`ThMC{8Q_@L`SB
zK?|*#DQpT|@sUeYeWiQUGvi<qdtCpQXOC1<e!keYSW~6{;heYSX!ej@Qy%9Op-F>u
zRBC`)x<bP!JPt|E-2~qKCN(P)(3I^+`aC^DuM*;qX&WNGetF3(D<mmG-@Y*;e*qLj
z%zTO8sSVpcS10DT{-vk^RMY>u;%}VzQC4J+%VoVHMr;1p<>&=%5NDV*fiEk<+yIfQ
zsQ9xfh1<5(aSvZQ8ko{|c6JB@fRUe!=PNuNn;BYtef@9>_t^aWZCTqxBk&PE9-#EM
z3qAgYvr6h({gwi7nTG(vEE!1-9Ju_}^VNVM37qw{2!fBGX81Vc;^G2MP5^zcsjhzV
zF#-bb3mlwpyRv;_3~-xIjwx`b04Aca^><qJK(YlkeF?9RHL+md4Y)3Ld_D&&NW@<2
zAEz&XcXZX`)4tun#4v~-Ej!|P9#=43e})oLCs#W(VKdY|_?0SZ;?Y=MTBQ*yMv%@+
zE=rKL6&k#sb1G<EZH4(ZC?MYPd5H7HDe6h?D{wU4qdfvN&$^wP*j4J>k>kG7`DLQz
zi6`onqMqOb(4bgGvh2M5*`j)6EU!a!O={p|vP|PO<BRmt#_i(z;0IoyVv9gxTJj$6
zuw`G4<7?+M#pXM-V&AnyI}))35k~bX^=7ff8-DM7CJkky13kx$?$NWy?wg}iWDAR2
zA^QvJxx`CXOx&ZdPg*UY0SE*S#t5ACO}O~@J|rG{!01_H5f>F@w=+TWvjZGw0I?%x
zWyKUT056EfQJ<er_x3H0RDD6g4%iVpaLWHS@d0+l0Kgdiac~80Phh71j0&87I4@qn
z`85OdI3gnAM`=St%I_}a7zsnnfHFVCs#_0|B!qKRyf1z6UuhYdnQ7_jx`Vy@Sm?_U
zcsOv%n&(IwvY-)K2BiA%YXYSYK>mpkRF|?~80~(=2*%Rp)nVZyCB?TugMbg(6?6gk
zn|Q3m)rCVp6<Q><4-7n}GT)Qrp{ssFFSeV-x2G9(?6XRtzn|+NoymaKPToe^lmGY;
z%~P=SH#{%ifCpbaQOD|ti+7mSXCTv7;AnHRyyjhG%lhejG%S~D(hk?Kj=ZeP!FW#3
z4b6%c2kpiDIA2pd!q|E8l%rDL$eDB9fgzXm4diJ;g8lRpg(L8D;`zmXhROJL$*GW`
zW*yCjYl|!na$g>*`YTz;cq%ohDOZp(h1JyKOh<Lut!5K{{W1z+8S<JkIziOf=N;B*
zCvbXc!Sm7mD#BylezrNGq6|*J=d2A^>)fhpe}z(?9%w3Nj?Lj{naEFg4-e$_a&*Q@
z8eGqRr&Y0mTj~X=&9u>H0^ac2c~4noLAO4MvNd-u+3-z^^_*!fk9)Srzx$$Jm>O|B
zLMIRR`i|mXE{z+Z9XtI$pJ78od!QhaMh&HMO1DoqhoOrd%+6i`0p?Y@F}U!c60oQ_
z(7$?I`|}5~(T^>Wu=dnMgfm0ih2p(8EPFkwv;B?zxH+toOHaF$aGMY6B5qlP&jP18
zc40@lH9O|A!jPpQZ&GHzSVu{{*%P?L?Y?d!bYbBB<+kwo$Vdv!YY$TXf~#7IJCw>$
zXFVMQ-FA}#hLRukH(h&bkxOSOb==r7)Bza{9}s#PBye$G1P7(r1TwkSrZkwqif{#4
zfrs%wr}_wUsfktlo`^mac1Ai569qEDlz<EvWC&V_rYBdJUz`o(u%07RQV$%tzoHAF
zLKd@u@qXnVP2HxCWL(uiieaFs=}TP{O)VWA94v~!MNM|df*b0Oey|{-JYq7^=zK8$
zczvSuua{Z@M)Wc`;*j=+Ka3|#>c10j#NuL;5+W9pn?6<6{@9u}Qf8Y;XL_Ex%{5Gt
z@yY>FUQcHPYVBZ;YQ#p-`gQ;E;<!ol$Is>2A2EyX18&op2bq%&tj9o13d1FXpi5of
zD%7aBZ@7x$dBwg~v=9nH(A79|UUDMWuH}rF9M|!R1#AJN4KNvc?iMt`L$kmGz51Xh
zd9g2ASco9`QnRm<y66T7^cBXH_3zeED)TM6S03~!5=iLfx%&-YJ~dS>;HFTJPq+FG
zm~FvfmM2Fv`k8^^(S#dU_VGU&RQ#x@3*oPNjE9Vsz~77$Vpa^LYr_Ei{FLwCWts3N
zM@tQgG?)hsYdJ})Q@9SMN_4#5^)+dK4|Y}?pwCb`Uowcr8i*vr&3!9IN3Ddykd!DH
zEnMD^!#@B=n<kmJWlGQhdiC5hCR4g3D&o(q@lXBR2?=E!q<3va!#>fK#UlMsQ9=nE
zi8;WxlFCu_hh423#r*6$kof|he&so_)P-2i+Rn?jWXu3Y1Uhd|9ViSSHT99|zWYRr
zm)HID>33;Co9^W+=*b+VAU1m|4en9laZ&!(bqY7V<qWx;>=3LzdhBesxYy3a05LJI
zaG12pw@i)#uF0h(kIYQ+zCHud@37KV==2}$%afJpeyYS}8%D}FCM!gf;!12%rmhbR
z99diI>D0W%3GVmvn~5YRV|!!v*AeT&6*J62KMnpGiOET?c`u%o9yfR(U4N(Y*_-7l
zL|k0F=MoD9WaYIx93OQt1vQVSd4*;RMa#IT-;fTC4jXXyVE!wAz$e?q0?!s1>@eA-
zV92abkiL2UPG^`vAobToBX*IB8)%zk!0vQ^<F^!-{Gna`@oamHKv@mFItiftf0ytb
zpYQN_c#yqJzX>N!a&;|@3<(JhZ|Ppmc6B{GKTb3>oBcbPw=t4}nU}YX_7ZCJyEvGD
zWl>5&!E#=0yFW@602^9cQGs_m+bJAyE`Zw>ppc@>&FumLx}eQGla)Eax|J0HAaHk=
z`puhPJv|cRIp6+*hzu}?dv*fdWHarEt;jqI;f~nE$KN@(2W0Gja)ei4Cq*Y<Q=#j>
z)=m9pG|bCsm>~MyT@26OF}c<1-RW4*oekohf(gB%-@6)MJ?I?@c%!*51vE<`TmM^c
z0hbSKTM8>(I(4Hv`*Q$FDhPS_yAgudbO`lue|*>khzf%i;uj=$iY6usj}K>giZ92@
z47Q`FmLgwXd7N&9r=+9@_DzusFaeWdmDL=FqT*}#RVhr0fgKM3*XOjn#3cM>JoZB!
zo%JaPZ&&>HddtzMiO(y~@pZixC9GH<6*^=QcTPJvGcaE!jPSMyd{!y(9X<QDqd-f;
zwKU<L>p#ZpN=ip_FHFKKR=?aVw}u#*c`8erzMlcE*f5~cxJmwAcaxMuspOlSuhhpm
zlkZIhPQX{3lu!g@6$Y}mH+SEha^L#6KjOl2TdFXvA)_jz=!?BHaC{}%{S!OWuSioe
z$g@E;T9#55+b09rY@Y5DxVuWy#a*KyM-Q{CdXuTs_oW(Q$-=IvIK=L#p%tyrvYwWv
z%jlQN%@oMqii*^Cc4GGyjW`Aj;)9vnrVvGR_RqKmti3RZy~M^g4orlKcy@gfN?e5I
zIa)q)9D(h@6CTx?@1Zw*30%9FndA<BeKPz~eg$ONK(=6IRlL5YuUEfPmSsdpczp#P
zh^!2Gx`3N;FWM2GGe&Q(AW+@N%J4>HqVYBPo0#;HaJy0>>$oDvj<8%@2|2BGhttYC
zV327WfBXfH!KwMw9y7LYwoORIK@k^1HKI+!K6%_0$R}!bv!^R#%PxCp%2*t%q=9Yy
zrrJ3@G5Ni|&f&!utkek8=CRx3(-+|)wkMkG#`5urR#-;kcEmp(1np%4CH^(ivWkm!
zwY6`Dra^J=pAr!Zg801U9%Fmqd!Bza%{uWrN-$DZp<1(!d^J5QmmKj*M9B5PUfQsK
z=!U6LgKQTG2b+?NhN@4)ZG153qzo%C52aHD!Q!0?o#+<fWFTj*Mq$yJwbQ5V15;Ue
zO5%X&U_TaIKT1ku-T<g05wxK|0in({%4^SRkl0vPcD$#IR5x?>H$;N3#Je&N5C2F;
z)wG+BH`K^&CySPBX#IsURoF;Pp-RGi&|qlYD0uJh&Xe5EwpuR__>OOG@bNEUU}V7~
zuvv1=RxdK4jQZ2z`-)iJ<zkmiP*5c6{Z)oafodKG39rNFk1Ib`R_sqs-g^f=2~BLE
z6TlrJ*9>}K%Dg`L@$tDyDT)quN%dyqYCqqEJnUNnO3Cy9YgY15yIol9RCzR~luztp
zK3)4{PySSnbOm8gc04uWv_JJ%Zz;MPHsmgvEA|SlQ`MIgluBqh{l8yh(-_}wA-Ltg
zJH&sJ2ZsH%sTtyJE^E-kxcBF1j#5!ls)`<St^R2EEs{-Zc<^8=eVVj_W{@pt2HiKE
zT`w1GcNlASZ^>!3B3uGXS5DttY&|JUek0P8l@8o`C8ng*?2+ILdN)|Bus7Nx_3w0}
zaZd-OlXCR}ZKkQ|e;E=gnzsb1q@@k5y-4@}WbV#MoILh=J6TP;ImSMO*M@Hh8;c`M
z<+Y&PF;sXV3Z>g^wR|)H5AP^J<wPSx-QivvR#WR8E|{J>zn%3M&8B0s!_dSP$}oOU
zh7jaz_UFSZXqkT6#IIj#UqZse0+_Ww12TT|P(oI&+=+r58hB%-Mp7!q$D`uoYr$ol
z-@&YFAcmkX@~YXx%^+Ry|J%~3vzh}bGU_WN@3FKY*AfhPU`PPSs4nfhbvAk&*2!tr
zQs_wF(MU^lr{s!zJscY^j!@I!<&byr7VQ)o2oO&%)!G|TwGdR;wz`k*@a7wxZ@%-z
zmybSd!$JvqSZ_4nvxFH@kR5IIT8;YtdY`{NQ=Xt|meCHLf@P2q=ZpV=Tdkf^8<AE|
zF=tG%HIsL#(YyGRs3Sr5gLkG2rDD8MlsK~jUyfv!s&@fw{@?Aa9u>4I#?0rnvd{VK
zo3Tk5ndm0|us<IKO?}z1D*8zKhHOGGgfp@i9j$EILB=|&7~$J{$l!;@U~gz$;JIM<
z>_3iM3O7f#e$~jZe4JA((Z<GhXEtH)(`YKvYmUr2;rs?c?dR!rNEr-u4z$L0lx;@v
zON{t*g+J|evW1KqC~^|NZ6j%ZR7Gzg*Br#?`nFU8Nh0X#WPN*UbSW_!r=X<dxnLs+
zH~DaK4PaVsY~h#c@=W#^gS<dXph^VLB*%mKqz>OSU!Ue&#g{c^<6BwpS&0b$BqtIA
z?;dEMr@+8`YM<C~*i<iqeR$BJTn#7Lqd-nhye36<*PbVM^#<hk@HtwLg!cG|jo>K=
z;M+FhFz?EpL*oJu^xg}Fy2Ql9S8bKwHSTx{hiD)%f34A*87<sQb&RlgWm4AAB;u%;
znf|pv7jRI&pLUh=WvAZ}?)5n$cg-Dvp|k#nHW%oRZY0*bc}Jtdw@6_}6`qAcB3Nf!
zat9`p!PHVT#wO*qkd2O_w7aBjlP!WRe9Uy7ey71_(bT(A=;u2BDFOjGwf`}~;~!mH
zlw<8s8l1u>Ze+5+y9Ajq+yymjOYAHBI>IZ2IV@ou3C%p%6uF=Ngx_$Hp)<O|g5n2s
z`~KNT1k(+alrr=3K(HMB#Jdd_gVqg8GdgZ_*x{s3ld1+??+A=vU*FQu$m{8WB%&B<
zgqK1lpG^sp6FJ7Gr&qtcv*LQMsR?Jrf^{tDIsBG$y6?>g5?n?W7HNeCsVrr1)yh>V
za5R&Y=LDBMUZ71?RT}ttLPPaC`2z^;YO-~LlDSdY&0JjYBq0p-TK$@W%c~&#Q`cBq
zIJCm)A^BlUd+)o$QG;swCLc>_cmr|5^~!W92}!BH1j3!)QB&2;_p6HP*kQ7S->2`~
z%Lgb^I4zP3LG4aI$cQ)-TFGm9UrpvbD$#V1I0Ga+^kafnKk8=g@-1~_*)x7V^c@8L
zw<-i6HMJx6f9T+y_yW!Kl)?$ElpRbUW_PGcLhr5ge8esqZ-lx-EkEv2`jeX>3DOn~
zM$tw}6r1LCLwl-oICgWQYK`B&KSWA9OUMuy8V}{zo6~rGp1WN>RKn+0IG>E3(yR^-
z<JHo+0{`rjRECXoWQ0za-@!2}w^3eaH?LJbeM7J?C7cH9l=D*OP_ows+2FUmc*5Ue
z%9&64U8Hy6&O}iU6qYJFua{p8vHZ1P?;08-bFFc^I`8@x++ymgpv`NeXZL9*A{5%=
zLIb1=|3l@G@XZKB`EZ(}rgtqnd)VuHwkBfnT^WoU`Nd`RX?Y7vAqO|AV5_75kNy$Z
znft(6k+;Z$aAOhsKv18nh|5riOXoa`%2lHccnRPNFfe>z4iy{c82$>m62{N9u*S@?
zC?y%=XwF_qON&zy{YLMp4(-4{MIfj0$ahGNDFd@7Nx}$abI6}dcH0+%W?U+YjaQvV
z(RGE5r=1-|NDBL$Mw{J?8FekMZI``lhc3$b>J)@@_l26|U#pfkR5n%~$vt)ga{{V<
z4}w)N1oI4*q?Bi^*XRsuAkpbI?)!o%raly>LR%v3;$eClZ=H)}d3&)`Cxw8GRVOiJ
zjib4AN<tmDSpQ4R@-&+OQ(k}vN6DrLsxvSgz0Bk3aer%(EuAORYxix9`4fP}jE;`}
zQ=TNfH68@`G(5eK$NaLjaH(y`G9LlSif4ayNkw6NWvg^nTk(iS*i!X<X5EL^F%3%s
z#3XwzFYTPk>t(31E{T>7RQj2VVDLkohNa~dbl7ZSlH&`NN#J6ASmp)oCl{Aoh;H1D
ztYFth)J*n2cwV3X2(_Q#2}?B+!^vjytvgC>uX}#9@FA%s$YZiHYv2?PHC+(0!!|gx
zT_3pl43(k{+KcV?zt*)YSli3+A$73k$UvOLGQ`IsJN!E9b?ts35-|4lyEfY{Ls@Y=
zmKBw7b5u@F^Qh?`pvlpwsni{V{kC3(xL@8yQ`7K#Cz;>?L@0F<k~|S#$$$Wjb+S$8
zg1$BMpX_Ew2JWu-XUQ4jRe$C%uvI67Olo(gd?fRXb2dHQ(|)(UC%O8hZ{3&(s0TAP
zndakJ=xGB#f5z@tFPdT+*vZx(G}c&rvN`42Y5B(ZW^Y!P8rPc-yUy?G+C;_3C@ua#
z-Z(JcvAoUw{BUxC!*?ru!P!ziXEFqJ?kL@h%X%g@r%)~OEJWmgNks$z%-R;VN5`83
z{{PB-m4ti4tpCY<ukVoRpJZ%xLdd0(*ugRe%~H9O=7a$#@RW$YpDZlk=OqlvB-`xP
z=Z6xG^mOsR*&biX_~qt!sHtRHTI}B6MAm9#Oc#H+E7iN};DJtyN}NGd?aX&=P(-i{
z%`ZPjIZSW{WUbIRvGqCT*Zp(cU~GSd_#H3r5E8Ns95Rm_FeC2y=1t%(KT67ovEedN
zbdXCt`ZGK`l+Rgx+kqd9eEf{5!%pr}Ydrn;OAv?l|6Q#x!QAO#MBpisR%l7#sG*hh
zk9y%vr`}$lOrER@w7-u_O9Slc{^8=C4&S1SbBCeKf~bjS<}#u9B`qFz<CA$aSlPmb
zn{S_INWDxb(;#<7k0IY%Sgjd$yUIjD`_ky{V%p}p^pV`wUv;ME&Iz;%gw<*o$t370
zz588OWEO~gdxvXQzg~xUDmvmY;3>MLv*mPwPR{?8i7in=1qN5WEy#kdax%cA7+b=f
zGT}ARjd6SD4xxs>00;6(t`viC5<%#Ig%jx5p=>yJ>m2f59~Cj`(6zxwp~QY3h*V4)
z^tb_<*>r|;$WiOxV~frO&KHOdZ^s{R`W_iia*_Q-1q~qkvzws$6M$f4=jCN&cEYmi
zHsRl2T3f95&UXb~Wsg_$3vi1|2t<Up(BYZ@T>+TvDkLZK=H&Ful=gf!F}e6NKEBv|
z0BF-~?gP^^GcBG1lB%jBziW|vJ7Q5kMgVZ7<01d_VBtZ&atlbFoBbJ(EOt_=5BnDm
zzljA(Qlm^=d}|HVMJV>6iA=O<(nZVn6J8i-eP1W+Tgd7_t_6@x)UzEJuz8K1pY#L8
zM8_K^kqnu!6Hr_)zdfG_yard|*}I)0(wAB<2rxkTaypjcGZH+<5sSn1)0_ld@(Fv;
zqZ78`5Q`P<&O(6`MTd~V#>Rk7B|iPT9W;<RrJ`|z#kKssm?@EAC6q}Y3nQxr*U(!T
z2Uyh!->8aHzoOpu6)(kR5Ua#u!hZfJAoeB|7C*$#PwG`J<La^}QnaqB`xhtmwJ?AJ
zeq6lkJaA-4>TO5hSMH={M`gZc<$m6L-EQm!uNusj<+V+8&&-$n=h18gW#a_VphyJ*
z@hd7?-kZr&SLbYxMkl$nT5xihEho3LD=RKpKtQ@<)!DM&R?u}gPeUcZh$L@02TqG{
z5=)+U#R}5WRpn+xL@wK_T`L}U3f6O9KmYl4sK$U150B&q=8ELi?K$r|x(z!=ZOoOG
zPXI3mTmgYU<EMkm(se{3+|xb!zb*g&;jrk)S4&~RZ{e-mL~UiSe9mF^j=$CR*T8?(
z@Y(z<I&?P!`e^~R)5W6$&O(6tgfJJKc;LeNq9dx$NZScO5j0iXdQu8x<lWREN7r|n
zY&<k`q^aH{DV7*NEKNS<O5#l0gJd-Ibszw8yE-hXhoHjyc;4TJ6S9eVewyx1>FxG&
zUppHTRfzi5+fJO6Z^2=L0`Ut{%x+;}pvU^dVZ1}c82a4LhZ2D(b!tXp9&zyFK+|g%
zfY^FC3Z?DQmjA$u$Yke#_I4%}(k3-Bp;Q{oaOeCpGuGMvZ##ht6DfjoGR=Wwa-ovH
z(4l+QD_lgp1$!Ek`+*cD-jdj{nUC}rHkSK>nVPRBkIp?cLE7kfdASrf_hSO9BFJbk
zHuhlSA}7bKHdzzzyE;mDIN$}N*Ck-k=5c1RSeHz|V}xyK*`Ax9Pko2Ix>6q9)nK5e
zHeY*i{YwdOVD~qNoZr7kA|WEeB;#FJY~IY3AHwoumR&nouUZ~s<)B(WtuRWszwi1X
zz{i}h*+lVVp1P^)yE`<=EEF3lLJN@zh>GggbR<`Zo7$rM#JQxshcQ8Z#9ubB@UKxA
zSdiVl{x!*A-g#0><@4dl_Iq*hFk()baesdfho6uHRkcBvn5da~rT6|PX&BaR*`!y2
zNouJ$e2p$h)}Rx+P}iR;FQreOnHLCgp|};=cVx)h0@{am{lx(^o%F;6+F*Zi(^X0s
zX=Lezo07nHHI%`-=K0{}Lktpl0d=!viWX4xfy5)mKm`Fv=jqF`>_?AWFWA^3W4TBk
z`=Bn*3@<M1{t7PWXH;`Y`{QPY1z=hGMaKuon3)Gv2S$(UYsU_n(Z^+Gi_(ar&``Mv
z0q^5>!{(8-7Jb3p{gvyHz}2GtsK6D}-Yb1O^^ET{B=h#ShveLau@a9bD*0Bf8u949
z<sOfp^d=VlHIiwSXzgoE_5&{K@ZH=K<!9~7xjdsH=JPV@-)>CeSFgOCL0yHgInCV6
zdGLyo{5^#`*VG@<^6eA|&#GS5_kK!T%80gFp3YS}BD9CdE)%i183nSO>Bq;}g3T4?
zt2K$rq$E{`b7j0ZZ!4#g#>VivI@zzM3+N;m&sOD<KVy-5!iwzvE?sh7X_s)mLcqpm
zv#_|wcnKu}o}o8yDA}Ug6fQOfN_~;8z?T;Ota-rB?zyY$nY^3>K1($+M#SIaj6ZW#
zXM~WtCSDc@w^s!B?T*<gPuIOy>d_B>#J7ZU7U(Uiru3r(m$b@34YsE|RyJiYOLO~P
zp)JAGhJsvuI<L^0NkW9C+`A_H?TcH5rc$Nynal7&qHI92(d`RAZ|4>U3$rYN$o`<<
zdWEU<{G8M2@pPo4^STcv_6a3=Z@tFRWM@k<JL`O?SZRL0k}+C2IEtK#bVPkFQe6B!
z_dKPYM6rEZ<IUw<AAVaI-jR5%HuvYBKluGNs1rhiV${xohq-2<bwy8ExdWurJM5e;
zC-Fl8E!^$ufvlC#QxX6U(dA>e%s|j6*c7Y@CsdAUvFvCXYFf|8)!j3to$+=ibeh*x
zm-3j}DZ4+VU#=*l9Jg=eY7ov}B(3tqD)-So$39H=AgE+^$+zRyVGaLSMlfYT8yWt&
zjlp!BC&V|q`?KFWtB80P#WeE}QWN_o0C<M#;M}Z?eEPapqTQOBSQx4(t-dt2Tp}d@
zj$d(wJ}K~hN6OHr%7!&j>UE0y895@(eA9%0sFdeW5fN-gP0zy<78aJ>@h&Dhvrv#*
z05Cx37X;#xx02%B`fU$yC=JU>4^ufUO;~km|EzQv0XZSBeeY2F$NWq~Zg}j=D*!3m
zn9k0wg7DjP!fQHSJS@~UWUP2Rdh&m9lb0t;85td71OW%0=X6+bM^2jWk&a9toZHUg
zXOMT~ott0l;bF-s?RogGB|EeFyn>NoW!6rF$YEzT-K}V`D1shcJGiRtAh7x?Td=EZ
zSlG&g<b7h|C>=ef%GSoreqH0qr}LY%B-0m&C49BTrMs_}9-QBN(jVFL_jVTZa4O}p
zByKr$8=VgFn~)i9AkGz%a6iTfp0}~BwV6-6WRc}VO}2?)ex3e#3+Fkc%e#eucu(#(
zasSJ0nqM<3BkOk8wR981P-Qdx38;yDVw?NHS_2(K$o<~){zghxPyKFOEJs2D{)GVv
z+z}D`cM@(aV`FLxm=CpRCud^Gony=JP->J1>F&Ns%NPB)f15q#iF@_t<T{x2%gJU*
zG4lKm%RP%}J4nKke*OCOaDP`+aDTNa4+iwo(n*3Ix0ou{_z(g-yg3jU@;9AdbYnxX
zYx`uqPXPP?DOt?gQ?vxvp&*oU<If){+L#m<Dv;x5GWO$Oe}C~!xx`Ai>-*5zbSKth
zy-1kovr7wS_2_&?BXUQNXl#SGv*lF>wBaupe;BvXy)>Sg`K`Ln4oP<-sO0?M5!RXc
zj7-W`s#7yz;@Of=u4V&>&WPBNDf)EM{==C2X5%q0-rHG)?XMBryU3E;JtuPt2tlmo
zKuFzwDh9Zr)IZL5)oj}YqeNNmgpI1;(C*&rzTQROAKzT)AAvt&Hx(sdo_{!5m;W%Q
zfHl&!F0Ay688x&b@@3opgv8-Qju@k+O`{{DAoD>|<k;nO(baN`o057>TSEi8zvx@G
z@tgC4&gtpP-thOJO-e+hDMek`a^I_2_SVQKQdTM`>tXum>Z&S={^!03-#i7qvEeW*
zKckhUAGb{%DNJ$-ZqZQ?)(>jB5Wj}n?9}UnGd8LbPCQ(`Jb|IV=NVdk4rcD3Rjxcy
z2JFh_<Sq$O1?FrfApC-?MQ;vWVM9BJxwNfUj$ssSY3y7U3nwgq^*lFk&87;eeyPJK
zAfkX)iC-eo{miO5@=TeE<Uahht_8zm=gT`{Y((fqXM0n|o3gD$-tX5d9ydQqynE$5
z92wTrwQV;a#oN~{c7{D>$hYFvI`h7F7_Qnu4jz{DUhk3=TvEjZ8F*h6T_L219<qI~
zg3T~T{&vI>EL_OM=;}@-=kn*c)q<EGBd<B&mOt;?z#844taS}r4QWP5*NEtM`(^oz
zd}JaXoD8M!ag5l{G7p!cX0^mipCjMOugK%mrrIgGyYGn2c&_c0hQSym3~k+qbj-{9
zT!DZLATkBuI*{7~a$S)r=H})=*3QYviJ%}z24z`p;gR=owy`+`*jk_-Kw9YS?FG6a
z@DPUp$}^tn18f)r(DB`jrt+LI+J5$m068?!W*4}XrPO|<IyqWZka2A~_IUTD4!OBQ
zshBB)Oz(l3yXI^3UOWA0VU0Q(8~y2h)d$n17(bez(mZq)8@ISPty{fg{Xa!rn<pzH
zg)BzbtdQ8pT-Tf6C|`8EKQ;UD`h016=A7+S34iDuu6l|zC-F=8kLry4;<udF?GYZe
z?Wp*|>}xdouWqRlvakVD_pHxigh)V6U3v9ATM81izPVXZ@kh3NayBc3?uJ$8^XJ7P
zO<P>tjpC9%WjaHkzW@l*+}tdpu;li3%j9IZKy(sqU0p2flUJ|v+JAjX3=Z~ma#C=;
zMPg^yBUJ#Q0gL(h5FMQtY;5DKH*3E(`=HGOe~(vVDk=`!#Fvl}q1HuU8-Tpy;(utr
zo#Yygnf%ehe518yXOc6s?;(!Uai2Xj_&f#CoRN8v-!cQrU`Jn3F?o^YW4Q8Nabv4e
zakGflmMgh5a^)nlOz5Ji#&~AUmvvn~@<%QsH7&nQ<3oJI-5gWI`sIg46K#GB&2lCp
z)9_6${~updMvJ@KN$?P%`EQPU+es$V%N(~Uo%LOWsn_Nn-#B{V!?4oR%HglPP<(@5
z9MO@@u4SZl7f>%JWYBKsnP`9sm5Ad6LE3ipx+`Z}+47W80H4;juwZ&hZLj*mh&<tr
zA@Aezv=%l6MM{*#S$Or7n{y9^5WMX-K`N?;<I|YeQ5j-Ub+%DRm{?+6y=Q;Ha>S;&
z9S#|QsJvAwJDU)rs<&<l(P|+^0<l3y(?yzD3smnTn!j$Md_T^Xjv)s+B1E7*cAG9#
zUrt@%fHXBW0_|>_=ZxGdpvMK^NDgsbUD8?4Uq40KHY&Ze0nW(Bb>$Ar)+zD|=%fLc
z9qg3|D<fjaxbDytne-18!c%7+0hsAizS=#Mq2DWGWd7oJSL~!tcVw|1xWvUBiyd!$
zx#?mrlHEbV?}35vk%1$2I&=KRJJPy{-Vtf;PWIbZ@3_bn=@^)UJ#G#Cw!R<gZQS^>
zJs3bpz1m+}DJ3y^#6hdxTh_iRYa?uq6c-4W9-24f(a3qf-D|{!*^J#WxbSQAEs5~k
z9aIMPw!fF&1Xk+^3#1)Foe4Be*dn)oSX9eHOOHm(%Tu*lc4Kby!0kt;(C)Z*^~SeX
zQ~q4#_n)SnsX{#k7{#Mz<oc?b>hjJJ!<1qlw1knRFB1(h&Dk-*fgRbasCBa~VbVHp
zP0|#zXJT}uSzqQLEG~I+zq{=Iz0$?T7h#JsI>fN(yc8;8Dm5VUE(IbrC<95;`Jk@h
z%jFfF)~z?f-;*ugGFsaCzt<4^Rg5TkUVC173+lwE&rE!eiO|?+nNj&Mg!C6x+(7IL
z0#4FJw~F9VWMheRO=HG26i<_gtEQ`$VFzT~OnPsYlONY!x3Tg&3XMa*&^7X6x)0k4
z5n7BIQjl;{HaSti!1Pb#1m)0xWl-tFTxs-vo#?^lSc90Q>2JMf3hT07nPhv)hzZb<
zA|XARe%{^Lv1?d#-9`MMt^I_fPKj8~?YLX-i?rVw4vvT@#q3x=(LFRpH2^`!V`H|z
zi$8c-MLL7|=qFX73HKbZ>Odk|(<iCW_7pn5G5MnNA2;8Cs{U&H_uD;@qEAuuem>6A
zZU;t0HB5q#qua*5yN(8Df(6{>Re{n{DzbM)i3Y-w$Y0vNe5O>^(n`d@XRcDl%3?o|
za~Nf*v+>mTVup0x>GpjNdCu9gOL+Pc(iWh3>V~#R+j(!~F;fy#oS7L=dZ>lgydWO#
z%k3S+%AD`OwlJHM*PN+Rw>a!>IYaNdrTuJ!oZYEUl7Gp`uH+$mbyFg)bnC}fRj-19
zAe&HLzti{i%kMQaa7SCs5APhNe%7(thgqR*H_esS&g<%ZsC(?CePyjIt^Co7{)1ha
zeNoYn<*)tj9C5u7Zt4D@Pb+#aBfPm8(3;tfd;JQG%&Ysx{hVLkbm^X42Y0E(N6e5F
zOrQ#E+*d*zbty#KY2^2W&bJd@yl`)B%LqF1b(TOdv<jnsO%)s*luZ2fmgB7{MrLV5
zv-=z*pv^Y_W-eUXj^c>V@x_Oiypmcz!zN!=2itd7FiFh@Ex(>kLn56&d>|^n0$EON
zZigVDptGwBQ0m)wCW?8WAcpJ(t(;lVNxl?ybXgfNz!~Z2t(RdX(tWvM=H1LoEqd<S
zd^X8=H81e-S3SmEwPPN`Z;iJDlz#i$#-12<OvG-}{^OcXaFB%X<ms8;zL?O@6o2o}
zJ`mJV7ll9YPc>V+Ht#T5bl@sOg(2?9@S>xAe}CyX=MhF8F3U#sJgl}m(x$Ly*LA{=
zcr*9!*&-a_56sMr_4G8<U$X>uZPhNNL>A;k$HnbV_Wd8O-a0JG?cE=yySt<t1f;u>
z5Ky{H5Rj0Tkxpq)k&;en5Tu9h4k^hYBn5_U_$|NZocEmT{d4c_=E9lvtmnQzX=<^`
zEGk;uI((A-pb!7}@<~jmslW$n)ySXrwPp<gSj<GhvW#fAVRp=|`_o38*KRj$j#)%g
zuQgez<0{U#-~Eijia$^NvF-Zzu!Y<@k5|V1*oaTIRwi7XERk$cIxz6fhl+8p@x5qc
zX-K-Sno9JC3?n)4k^HuT#RZFm1YbU4BfF@lisx}UyzNU**d{ZG*W2F~SG6d!xcsP`
zP>|%T!k$Oq*3fsUeQ;)yLhO4*kS6+BCjS;D_we_SocB|j03inlGbh&Mlmm13>NarG
zRoq!uzO?ck$#}4hL61~TG{Sl3g^Am_VM#sroVYDLW5dkM>;MLH-JL3|>H-A_@lPaJ
zIXJFc6Tj`xKyz|Brl$p2SPHFRW0?}mOG^uzdjXlCPzS`Be*<e?r{zCkv4usumF4g@
z<X6JegvU?DZ=3Cd!qYiJhsX%omc`O2|DJ0Xl}(hM=JPrSUaxDe6{q<h)Z-QE{Dz=-
zkIFnffu~y)EtB2s@)gra9@}+ND_R>EbGBR(-a>5@Uc1-)Sx?%&JfaXYEmPVR<iLSm
zi@U-)fN^G_g(OvacxB}Qh@=ky^ncb6A`;0cA3$;3<2@7g;NZ_J9_6L?SJKeC!)N%3
zuM33T@ysNry~recEM2q8RQMfoMDG(JjmxcF`_X3(Tz=%O<0Br~Z8~b2H4SR`^jCjY
zJiK|tU+KRJV7FxpYKTmI)zPezi_S#;E=}l7FlO^Nk)X@h3L==4$IDiS6p;M?x@9RK
z)($0k(zD6Gj3&ytB8il}NFT3Jo(WmZ2;QtjMzAV;eU9^r1O-loWXO<oo%=DWe+}&g
zcFd=5LRGh1byUy>7iD3mD@`X}-k$FK&p3g^5Ny^)H+=EFR)2l07Ubsx=aZi*%50!7
z*k||-ZJS@EEk0M!Os~x_1f^#`ez*aBuYY85vAwBj2B<H^AG7Cy0DK1IOkM#C5-zf{
zlT(los4-sH!g^%R-Cv$Qs&^85MzeYhN%eL(OY_cbju~0W{cWProcHOyQ}3^YsxllW
zpK&;q&+D0AVU8b`lUTMsIC7mOLb8nme4N+6moG#O>lRh$l<^c1Uzv{vl0i*2XYTB#
ztvb^C|2%8FJEv%r43iJKo&Hw;S6SW{nxBvV!!YAWAA>f5?ejzW7eqJ1@+#Hmt}(|U
zpZ6-t^;Cit8E0g;OnFj9ZM{F;{u2I@Oa|o0X;Fg(IXR~?*~TWjyl5^2N|ARw2MT+M
zI1CImeZ`&dkUww8ai{iPk};C|bC0S%iYf+9V4Z3p*WGem9I-Ce{UT@$CedP$L^L?d
z6=acat^8IhEMM6EJX?gfhQc#>Xd=LgjM`Y+7*}d-1v}h4GwI`*QY|aNG39uz3bVXS
z-O5gg&0~B3EwqVdEO=*Q?h9jYxMCE9F|oU}a883x`b}qQ(<pa$e~o0qu*vyX<uN*;
zvd{K2(|cYF=|c`&V^5{knCtItGyzP41s?E<<eS0~1EstEzUqki&_Vyj7SYF4$fzk*
zd6L5uh(MZOwu3<xV<;NKmoJtIjEan35<@q)nevsH8eBrduk-(%iqN3XYSPnThZ?@O
z<*E9oA5eJXgKCdo(Y>8aKiY&ooyLIVe>F5T6a^JoU^S?UM~Vg?BNP-YT68k|ICym2
zc7<c(;o*6Ct?rto@>z8OEsM>Y#Uj>6!#c<l7b3i%>*CYIsRS?i*C-qxCKs@bol#%Z
zf8RmMUc3J|#QHoy^2Z5fVjtm=BH0nMsh)gUnI-2Lv4x*yn5j1*St2?I0_0XglGD#%
z!_%o?j~#V^7yamwR+p!PqYYJk!K&ldO0J@_E<=+<o-#Aj=69fd*`3~<%I)QCrQurF
zAKQnHCaU=1iWeHAX#BxLsGDR&jOT~TFJO=5RyJy$XUji-I<c`SH-=VlscLGHMIgIK
zJU)+(#sWeED~hVP=@J88uP2$$=4p<E3F)8)&3(01h&Ye-c&L(2LE)mEAxj-Ha4kjx
zO!z0{;BGXU(vngSn!5LUNIHuwI=iOg)QoP8q5&qV9f*M&S1-QG`vowHhV^a+D!bjS
zDL^7SvYQd{f708v)duhru22N5Kc=eKtoM6g&mz;bqaLNTk>{DW;d*|1S5mg1g7qo^
z{fMUJg0Uv22EP2ACeFNcy^AQ>I58c6W%q+itoO^B*}9YZ<(p*;b>aQlvSPD2zpQQM
z7|f_Gbek_xqsBS~O8@rK2Q~OsfSVq-AysYDmI^5F_49)^HeOBSDzGp!FWfRNXoG&Q
zvBAO5pviZ3_T84a)ad5`h`u}r?t)k1VcuNd<-4P@p6O(8zS!89Zf@>eeEcN&UFqqd
zYxfd9ByuV7nlCocA7tCXh>Pf3Ym~9{b*OxkpDc-qS?+HMW1*<ZN7?6)rPXKCk^%xA
zz-zLTXR)egVIjlESN$?udX?ra6?8mXM#g1Da*N#Cl%Jm;l<@gH+<ov51E>hHFL0=Y
z4R7OBB!`G}WrxZ_Qgn$voM~K~YNI775VDl+3{j67?B%nL+Md7_?|#AY6XaL7i}b6F
zOrLSM(|M&G$LRT#s(GYZ)-gg5A#@%A-x5s^{Y+mfS!BumsrOX9A_JHQ=7ADNQs49P
zTeYDW0^>6Y(+I?))Q|-&E2BL59&sv(LF2c@rqrR{`R1dlpzQW3_iPZUma*|lg^Asp
zKUw_=zMhBNkG+LrjPN1W^}m0;9v^BAs`3a5kmLp|{J!LmjT6e*i7HpWAaKfWh(2Z^
z575TK+K~3=RJiPq`xO-}&d&a_zfswlVWjMX$<X-t`0w9Ffm5yz#uybok+6K!=blr`
zHsNiVq)E;9j#4wuERwX!-}#C26Wfto&|+$LCX!;@1`)BpkI*fae^9)2d1CtE-GS6)
zdOq4tE(5**t%Yi2&eVQiX?#|t6lw&U<cAbj&aLtTjN$!pm$_&AwsAk7<O)jG!YY27
z5ASw=cg{YfOrdPQv<>QJ_pN1MJ?Ch@|5{kD>8&X9G^g+<^vrbClW2D1J_hw@4zWS$
zg{_HuJNgO)DHmE(`fG>s2{sfITLBjd7B=?39=E#Ry`V6uuDUuX4wzT>_v())+TN3-
z4A<4xf{P?MDM>?969n%&niR$cUs{yU277x7u5W!TE@h0sJDxj!etygpD0K9d<YDji
zqYP<7ZU7gAX@~bg9wnN9klTVbVYuj&!t*`#x8^usp={6}2?~<-6fDVCe!=1J8v-4j
zUmy-zA0qV6UI?zPNoS%c&z%Vdd*obH(6^<Ce13M8Beb*9DsggIN(#ee#3oGje1FYG
zmEOsMNmxJ{7Tjkf7(5!K5T=M(_h)42wRlQ*ek1|&X4pVi&$u$`x?9s9CBf%Ulq*}<
zjZ!DDemWCEd|Akx)!3g?{1v4xBs5br`KkrZ(|m|{H=JA5>NRQH{ZeUrp9`uBXqS7<
zN3lJ=dIA?EBKMZp)O@b}rTydrl!AkB_naJh5vvf|w57pO=fzNT%$M0KmOr8^EfSsx
zIi&vlNt0y!gp}06(6AJcq@|@_GVdRLeV=-_ya^m#oKK%xy{lwnW{$;srF^y0``p)u
zCqvj7SWdnsCx87y^q5jiE!XM0iIESJ!Gtm;@)sS5%@KAku8s#bot{*-il4UBnB3BL
zddDcTx{1*UQ|BM*W9@zFUnAR2@kd>ue@stEp^K@e7CI~Us7=nFck?>>_({XS;G_Bk
z1%8V`;yWJcCTN473#C>`576Q_CriI?;gM{!*_U{cpIxM8P#Bg$u1>b`dHV@#x^s==
z)Ci&Qi&&bz!wh;3X3BNHqB7l-o3CYWg<N84ZiX(+U7d{7`||xB)p4MKb*`d!-;5IN
zG-#9y1=^(4m|<L{4spqi7EMIr&c@0}E?{lA3VR~u;|ZFnM<w@xV;^i0JJTh<Doh##
zg-ohh_vm=|e*A0e(whWUXV9%vZ}pNKZ6)xBu@sm~Vfeh{4}r*OYJV;RurV1>kWND1
zb*$p9V1)PynAAS)KNMNnctSzZ2psL;y)j>3Uk{pe=?O4E->|%Y1!zgUzPVXgTnseX
z1lSM4Lr)Yj5Su_GX^GNg0;Rj?hQ4h8TgUB%v$eG~{b`oe<H@+`eh0O>E)5|Af!j`j
z8D3)lhH$ajgBDG&%pY?6Poq|+kc_I9v?!YgdM@G6ZW$1#IRL7SD^!)zzeJ1Sj3w@>
zbxXwjWM={cu0x;rimR^@k2#Kq*K*2dLQVz-RJq0aCl?pe5&?IS;H2&B;YoWrJTBCk
zSidIP*s!j&kdgA*+gn<q#=^?99X$cRTK+fVN5{F>bTZOjSMWdJEXLBwd{ODKXt{u&
zy?yC>yNe;OkWEkc`LnSxa&K=qCd$-oMU1W<I#%@AQLzqh@0zi=HLn>?-qd5H&`59t
zuZ%|80Xd`M9t1W@=g>BEPPxrWZCtXtUugj&4t|W$>y}0OEZg}R6C*eRZio>xXm!7(
zU6&Hc{P4<S_yHTd7gNNppPRGUt0&K$xec?$(u>qV?G)f3?^eg()`kSCx_W9F#^}_v
z%EKunzPLTp9ozrw-VS|arN#a)6n!`0dCNYXtE8}&^<85oaU6OAM=7WO>8?icaJckI
z^lH(#jX=fy0bT0{V}hXIOWu@_kS+r}NLv{&(*gf#8|f}NutB1t=Ke)z2Jyy!_Vye)
z7C>oRAn7Rp5CCGDl#~Pr<GGoc3OZZN_ckgjDxg&q9E-NL2hcuEAM*M0WP&>mD0Yd9
zlY+qMZA6?G1cB=n8ynkF^UNOfuc@o!+$DYbL?P^CsL=612K_ZP1?vKa>3<{lCPM)f
zDhb|+yu2>~;5GCMe%8~YDEb4mZ+dKxfr~TXY=<!Jd8l{AHBR>D&&OBC8|&+@*%u9>
z^5qjN%F04p&`7WJvh{p?Zh%f1wDti;0sO+iQZ`ZOLyhPX`M5bq+}HtXdMT)=>VFE*
zARo0GR=lsAHLgR3JZcN=njV0nI?xOcDyu`uh`v3+2=)#HsPBZ|6KTKsF%ag;CN$Ty
z@UL+B%BaongZ}^!U4SBBgsX~zg6S2n)8QF4`17l)UWLb_miBHz$!Ndt>!_l`j_16h
z;1D~jDw&VR7^xo;4?h4F@%bo%3jvF$D0sH+t1K+OBp>Q(pQviNtq&&B3JcfP)LQK9
zi~v)xCFCl!%zzGL2>Bf>1Zkv`X{K7fzjjZJ8%5(h*V52f?g*aU9(NfV=QV0L?AYjM
z14Xq!>BDCec`}yiNLWDTAIbgH;Bs%qYv`*M1A{kxT~Chy*iAza_Z}l{bkP!SnI*~}
zBPY4hT3fNZ$!QbH@@v&f_h1hQVcwGPcSB87oHVrkF4JuzBjpLP?>4SNJ5g{M4j=Uw
zHnYP8F*^Mp5@b8Rvp_4JR#942_U8CMAfmqPF(5No7WF^!_u(;$l$<g(vAcUkQ`Idf
zCzDSnW}x8pcTC_iXHk&MQP!X-%YCZlulg<L-9*n@j1^KZ(q{ALW`*L)Rohj{K0Aj$
zSR%**L{FY^Y=<xBUMEN%*|V401^9f2`gQc}W?}b_i2N2Zdag?)Z@c^3XMFy-9lw5t
z#qH*@0A7l<$dpeZ<)g9>RpL8eOHCTIm3AeN6@lZB`LlrHvSrK<)y2Z?!^}+C9IIx=
zMD$s(`{ed$Z|@&q{eJT%d;3|i5ugTDzjVwHWomkRcp&TA1{#%;L=wspVvyB$`k}}J
z`3K1Tz)?gf^90SVpyc&dD<0thArV&r92)^gveoSDhK9@JrzSynlded?2nhJ9xs@OH
zI{)P)e6p64L;QLNRN1n(Ai(qEk)^9Ee5&xpq$Z~>sAT@GnMp);b_fCjK#ZH2u`yNb
z?v2u$HKhCdd-7ZX(?ITGzv6#&b~kBh^uT^vO#Cw%p&rl{0YLG0gW>M(ZmHn=;X`f<
zczD8JI6|7geKS^3!63Nu?xeQ#1)Zg!*S^H%zNe>$KE9&D`P796ycDYU{HyAwFsad2
zWJ<OmLJ_m&U=wQfeY`x}+}Z;Dt1BHHFmyXXawwr(*-27FjElm9ejP1zQLh5g1?@5l
zAn+C1!<{CFy)iZdwSOijBXo7qYJbn{?Fn2QdVRP)sc5-HES|w6KI`g!*>MSqH2fbP
z7-(s+gY)$CZfT{V0`xG5rKRF)#=y+I!9+83{KE$4MJhxHQ^*g(ypY}XYOS)o*MI)h
z+25Q+SyI(`pFau@;%E!_a}_d7DU(_~5=yN^2DKs*iJH;|d0#EG<3iV7$dWPYC(V3e
zS0r?^g4qZsY7pkS{<fSvo>#{+N<~@h$|G@jnmPJ9=0(gehqv%&#6(5Y`g3k#V4&6_
zOT^vnK#w2Yk~P<M|1#WHPI|=rgTobdb(sq9NtLFl^$#hw^U&YrlW$m(w6?7?{kL9b
z*~wQF*1>DUvjEsXF{CqxzY-wN40CjJST657$T;sl9Q_D0dr_4MbJjCAICj6amc)~J
z-1*WGA@||IB@62BK+Lzj^kqZ8`@7Qj-q>@R1F7+GW4nd<gRNgb+1x}kJQ8>^j6Hi4
zoy#wA+rKW6;3YVg+DcmYcs52*Kpr(aecd3L8+lFo80WrmlEJTZv^cCR{Xv=vXb`Vs
zyayv`0*x5a@<;S>09f(x;AJjh42T0K{QUgSx5QYV9B0d)iHM*}gS%6-E1Q=JRJYm(
zIV&hAsG!RSMbFI4APxjhJn1ojKtv@s!7zb_l_~C33LN+0;izyn6r^SalpqI;aZs8{
zA7O^D06NhbaXY9_^nkERSQu>r^ok&m7(RcFHW0Wok^71Q`0wrPw$IKy5teIe_}NcE
z=P}GP=m3=e=DrCUMo9(Tvp#R$usF$@BW%V^Yu}F(1FyHMm+S+u87rXt5<+a>AEYEK
zKgR9_2|4cC+C+c*`uq2Hc9@x&?HwJvc`Q%Au)O%!xxI@}228^QO8YLI)WrZ%r(OVE
ziX!soz+5cv{Aq9#9iLV(FhBx40_9U&J5jx!NLh9Z3Z~B^=c7sZ;18-o=uneNvY^X$
zZU9Yy@ySn!7#bSFs9&krX;R<Vn4^q;nv-*2sR#AHRkK)K+tOGqGqA2Q^Q9pCp{#7T
za&zvH!foUVhfNl~@J2^_$InNx(Bkd=f5iuu8=9%G(#py+ROm4(suG<v$=)8G9KlqX
zK?y7C@Vk|>UlSB}ym7+laN)tP69flUefQRYuSgjt&i_<ZSvRM@$LBP`4B5bh;HCTe
zEcVW+rdgs=8BxdbQP(?Hu5O0+<rl@_EKuvGn!IlB>vB1dGUr?de0k~cevw$b$d6}X
z(_Qh)KAs1Z>7z3fbb>DVajT_?LV9&LoW#-|8|AncVswbF8qB39-^HP>QhwySen@Im
zIJCQa=dxQe9VbhFRVjIVb{L~@*}(KD``vp3_9XM<-R_Ubr71NI!svqH$W^H7^S%3k
zuhU!AnOU9G-Ai3XI2+!<Surh*n_tUvqs{G9o8UdvwtE|6qO>h1+)Lt?@jjPwH^}El
z&}IW&y%U=kg7Fd5b)%li4<iTqw9Q%8?!zavF3n}^x98pJZzj0!ZKX#dc_mH|PaPi0
zSl|wAme=JaXGgw5Y{`0hUX*h_Ra@sU+`rynArgG0s)`9C$hiS{Eu8h@@9$H<qyR1$
zFv9jIustw7AfqAYc{G{&`}?baTU8o3(Vqy~ze-F4*C6c`w7Ho|+UtCI>*C^qoW~U5
z3guIB;s&)%N5;RVa3Uf#b@ebCEqHQD3OWwj`oAm+kW{9tOY-TdB3d!pPkk*d#C4^X
zMuuaMN2H()f(r<UQC%=GF-6HOY1eq|q5h$kN*f?bqAvz_)5Y346hpjnQYR>Ecz9Sy
zNC+1P2jK)AD?<*XVo`TlAwvQ>f)JIIls=)--+-|nG%SPKdg$^iB1Kpk2RFA<Pb8u1
zkLnit$-Iv+jr_@+G2q!;US0+^pO9ok{*e5083jRQaC!V#IIV$<ZpdFjrPF7<Q%#3s
z-F3mgF5rW|Q?`OFH;Ts&-m(Q<rU~SNK8}IWRtBy`%sl5M3IZCx#UY0!;8tn&{M_WZ
zyEBxO)!-tdsu8!^;1ZFZev~N@0j_dK+j>()#er$v@72{Ky>Gy(g75DavI=cdXZ~_<
zAW~ESjfoNqsmua-@Kz3c%*=|1Q%Yo0g2J=`H)q1Zt99{i2@CT4_esj1Z}%3S8b7~Z
z;fy?Bhv*D2cV#>?mU>gT(84{cmk79520BlPB#J~I*kOITU0!#B;?3%BDmO;X^{az7
zEnA0P;!sE#u(u*Bht{1)+03Dcnu}3=7(80o-46I&a^(F~8s_n<tiJi(c3#lENGp=}
zN6Gf=y}b?DyidAbdiv^TL}Y}<flf`u1$>h2ufOD(Y+nPLF^%tEGb3#cGDf!k!UH28
zdyuYR=Igz7ckm9p!H$-uuxg(MyT~LN?v^%dP)ijn+ud!i-LF1op!Y_g^r1M{&*%Kg
zK0<wR=wL6eXuqz<nJMnx5vi2F6&Vw05!=4*x-{t9mwYwyorad*qadl|rZM5o^<j!I
zb!lvdo&BTeY1DR-bMgXNOyAfwr@wgm4}m2KAsHsfZ&f11{o{wao0;LLnb1qny1duY
z_LQ4jUO}P7Yd<+N(@a|%Nf+~zlcOVc&VX&LlA<C)HDYM+(=|_ko{16)mseB-XQU@`
z1XWg6zH@dy0QKh}@Qi_(mlvPbm)?eps|=(;2~yDuiYHK=cYX{J2@?E@lF5MiSNzQo
zN!SrpG?rpC875I!L#PTf(hBHzP8{3>6}$@T?EhlHL_jaQ?$;x0VRm+VFhNzjf$0Ur
z?+9zd0CfTe?YzW;-ku(W^Y0iadMFGaFCbm;Em4$W$VqB?dj9u`T~BtjU|HFFS_RZL
z$T8#7r?_82LcrpIz++@)CiUzYVe}0tL;-XPFqo2jf`VL>#KACi&_;zrnpaeW=8aW&
z$r>JfNmtf7$WpeB1y}T2{~174VSG!5e3VUswsOG!Ki54z^q5YDtf&9GdfIChR-K8N
z_IE3>8k%FeuRehsm7-V7+S=NuC(+>js_=a-L-K>3iSMVPt%>#ZyXtE6-@iZPD?6rr
z+p~M)ukv!7O2#}vmLP=JQ&;ycmU5%FM-la!h(2a#Cv0Q{pNa~(NS#t?6aRd(e#cn6
zQT+UG?%QG)KVkOb))yxt#eg9vhS!mLX>JHpEK(VB3wWp3=&9+y_Nb<&S>W^c-Ahzo
zB-N?NWaGEzo;%JW-l8&+czKg*WGOJ<ww-kD9b#SVb*A?ae``Q9?4L-Nx>LK{S(R6?
zaCDKpP%6%##b6-tQq|~qbX(sb`Q*E2^oyv<?6%s<Z+n5>j(Tr#JFZ(cX%dW9Ovb7X
z1ynxEH2t*xsZgQI1oPiBUzdC&5YR}XJ{&ds+@gxw8$Kcqg+&?s)X%m$@ZYt-Mn3v@
z;|zO3PL6IGhnCmcU#+F1lbD`fR#=EiaCOiGfKVjGztbheSSzDPvG-)qRnU#)cLfK?
z!-vzYOi?$axC@XbX4QQ)zTM`3Yieeu1xMpSHIIvqmT!yoQbv)ULGYhk*>LgN9(%&X
z#6$uG5=PuXHPz!lYANqS8u13GD*QQ0)(l971c+-O^RHH4Z+d49jZq-1Mqtok{|fGI
zfA1^|(`AswG`WswhFl3U3BM*Km6VpsTeYDcfn(eKM=*oHoEK1i;H8lF$3^=;_hyP|
z&50XBRHUysDe}<;&`AB?gY!=cq1d^%WKZK6{BHVEN<nr&u~S$Fp2F@L-ACv|&;}Q{
z9cJ)6rpSYNY9cqYVolj#Yg#>6PsiP*?L)jxg)mo=3iY@~T1A~M=noJbn9wOSL<q>p
zIY&4a4j}N+81Aw{;w_(f?XnUc+pMh%>xv9k+wcCVWq+EvM($^S-ySy*MZ{|#$P^?*
zRahMbONkgBm2*~|H?0|&nR9V+G<S{Y7@JS+-|cZCLa<{nQ4g@&1333QLwJJ~=`lac
z`&F4q(e4J?MSMyni0JLwz+AoLJvZ1<hJo<I<m^VDzlXI_wQQp42~K5y-?6U#_WZDK
zh2EYUz4w2<^ksM_wM>8{pADGk%-aKS`O1M6+Q_KM_wrqia*muN?YJc$Li(T$fA#D8
zSEu?x|1?_ww2H;ZC`+Ig!K-nc-!QYN2cj8nFvX!B@t8H6w)%Jh!ME&iP@8P6l<!5s
zuU+sp;^5*6@$q5c;Z<HfV1!3Z$)jj#O}cd6f`x7;L{=k2BuOn?@VC=@J-wOX6y7z(
zpP%gC0zyFAZCw#5*mbQ76}%eHzJh_2DsHyU(SV&#u>}m4KyO1I0!rr#kgw$<!5#$$
z%^(refA<tZDIyf4Vj>Rb1_0gS-Tl4YNGjs4EKR9w&?V4dJF{|H!tVRAfNT^DjR(bn
zh)^kFX=hDtd5R@b>OX{L;MLlM05zy^x2iow=~;Znxyh^dz0E(V?kCPO_&Z`DycM`y
z7qKht`;}?Vy)BrKB&~=DQ+tGVgU0(6HH90TX{{CEaqsb&IAo!uWJ>1=VXAfL_}Zu2
zU5^&|P;ER(jD&U=ahrf=XrB9JY>)R)_l*y)l&+Ef2KBC<nHCo-$DYGs=Neo%%N6lU
zgaMXi%Z~qXw5s{d)`G1^->mMebN%Xf5mmcxx9{_jcD&XWzZ*3fbOxRjiaC@7pseVk
zIc_~*dnXFK<ZZ|*5@>9Vxt0Jj9)){hI9i87i^u8dX#{ZAo0>W~IQSea2+2wZ%>Xz5
z;=%%&Kc$@~NYeseS#fFU&CLx!&-ZtC>oES|L7twT2vDuO=O{9Kh%?9&NJU65@*R_M
z?>jyPfYqVHqt!zwN5?WCM;}4UGfV8`jeH|Ou>fz2=*EB|oROLN=FJ;GqliJ^uxx<&
zd;rs4j(yG>!JdHu0DSwpuDwbHazJ2p0CE~Y=Yb@wJ5{SU#cb~f+`vFJG&Gc!mIivp
zIy>*i$tGIb+N39hOyxlUQIw)_z0+@yBnvAA31BRooLH)I=3wjsv!e3-e6XaXWKM>{
zN1Wq}3o^pD$&@0Z%mNjWSQTaFFOMU1Zt8bKqjvv^oLay=_<p(_VTHkki`T|rrbaY?
zsRj2FYs1d79X^=9O#AzhBySW3#)v-6YQ5R7E-&B!a&>ig2V;><ZPSxN`3E&E_JXrB
z8i^JoyYZ~IM>Ti24({<H2v*nM-5=||*Xq>t7O=36QF6^`Tx=#=jtg+LobUS0-#hi4
zdQ0$Bct1NYn4MJ}9)9Im{NAyMN`0zub}HT%m2Jgem5kAmszl=8+y3_Z(!A=fjkD-x
z2Tr3%Pnomhwm-^GQ@)Lx3dIetdA~^v8yNgi(hm(ORVzgheXcoCV<M!Xrpa&bVmRG9
zFJ(o|QqcP_t9e8hEE3mQbJ_yAy-wHB)YoU4UT8^eSD=81OuHbv45$DE*~H`)j1+)%
z4f_P{S88f%CR_y4csy!h9C`o+61~^rc2Y_Z3GLhgMc)L{t~J5Ih!Z76MXYepYm#49
zRu(J<vOuqH&K)g{BmZt~p>4+c)n-tLcTjf$T^!D|nW<@cVPR5wI`$6UBz?^OT$O>Y
zE&|c_q9Sefui1!40O05p>iXE!BsL*5jc<d6yH%u-4n}Az<cB=-379-WZSdeE=GqQI
zz;qH-n)-V$FA#1OtOi~!Pcl$pnV&u(tbygxwAquxxEr80T>9mr?wc6mWC(}xrKJw~
z`m?~XMG$P_1&|usm8B6Zv^ZokY(~w_%cPv2*WFPiGk~>R22yW{J#5IZcX_-zRoPVe
zZ&(~$nztaPe*<F5Q^|te>qP+{W4Ciszhl<yGN^5A@gliV`B~Uv78v*l0_jW)V^Ntl
zT1Z2Pi{%iePrdU#t5|ZZ?Ve@3_nT#tzs#}FP7f>%3+L$UY_J>qDdBzS<eT!x<yCDm
zF%wLcL|fy-)6eQT7@xt`5UjY#FGj;d-sOf7udnPSboq^M&OSDrmOw~yjhl>`_Lb~A
zs-!X78YLFRd2YO5j03!^(M?}i1X+|~E?l%j=dQPNl9!kyNRgFjuwbf-UsOp&C34UP
zDUo<kksZ09d*h+*CD^k6BOwl;h9HZ81sn<1DGcZHJDRODGod?xuXRhj|BFzr<PD@z
zfR&H(k+b_@2%alMO<^G+rC_KSNC@89n<<-QY1yB93W8A)MnFe4E|s#m`AT0O=)}C=
zx%^k7)YXMlA8b3xQb)#Z81x<kSstWiO8MVRPfi|#<gsk72$kuH2~q!>pMco#38tl`
z4O&8Il|72Ie8%^){mG{lfOZZdAv<kvZ;O-%2L&M@1qXdZ(2WQSBOxKlX+f9)P>Eb-
zLnkSsCL0@Degr*%W(4N}fXZ!cSp_`{3kv}#-to8-Wv^Z>{{CnsGbJS=i-n5|n@!&N
z=W=&=$@0IeV4pnrvd~nY4{b$*qvGr^=`ZfO;%_{(m5jZp)@8jn8k5Vc9k*;iC32hM
z@EEX~#APO$fgh5)wR+LChVl|$nNN!Pk_G8T>Ybr)!TS2h-5n7w?me$gqY39y3grg3
z98UqS(VfJlO8rYZQs)D$`B$gHS{?v3cvO~<W(u^U6-^xTq-y9~S{Yb)!(aDtmJbh)
zmwt~9K_HNmwd?j`EnM(ghU309^}d1+UJ2e{xaW%<x4&!cN=r*Yjwu{`>Vn1KNwDbU
z2C-~v<)x*0uh_x325>#|3A}aWV~}gAkkjmgZHk6~0NAH2wL%9+NAKM!R9qAiWwvDG
z_>0|N=u_nh_(nbb{V`EdVB^Rg_5*vb>~)%YQAh~V-eX4Z>%xKp5>nF6_YLos5t>vB
z#^Wy;coB@jpAC91NI*|1-)PDcHZU+CXk^KbKrd($jn{s(+Wu%ji1390DYP@lGzV`|
zJ}-pJ2ZZsGkXDj7aImqV4`JTn8*QMDe=sux*-=oasku3=*%>v^NU;^4f|Agm!T?1-
zCr%(Ll2P%4$8*ttf@8K5K<yY+sBdz-WItKUh@-*$g_pHrndgF?wD1ystEx<z|4}h<
z7wKyGG~Arp|A~2>%Wx^&pM8r068Q3nbAxg9h?HsAle^8v-BH=ggNEKZ^6)k6-f`xu
zhtPTqdDZsVGDw+3q6<I@p0BcK@W0g`7|L69ha!Ag%idY2YdKtf>31VYBgculc0LkI
zezzew|0h81@*_6K+Ky*)d*DD4*X7>c4wG`}(b6t63(GNYlVNv8SzX~3taVB#%Rg@P
zwTB66GXk1r6YYDMGO2fwm4ke&8>>dzZBLaLiDlY`8)#vX!6N^@W=g1?M*fUHI#g9r
z{_UH^pv&$Ed_Wy96pRF=>_GRcn!j`;!(2smiqz!{=5O**{6NO!sS9?WZEY|_tF%9?
zq~hnc(%`p7B)(%J?j%8zKc9*zVN%#~V|9e*$KUpj13n1*M^Th0oI_4iYviMAb@A5S
z!s=g;cBFq`NSKL=<+47Z@%hZ^>HK?hB~VV1Cm|wq9Px#$_!Cvhu!#OOw-6a1IBz%B
z2o`C}%j19cOp0@Furo9T30bP;ebU^tZtg4QowL2WJ4P|Fx0sk>2)i?74gEdLlD_Ye
z<v7*UhCr+JpbZ0GHYeAM=;)!Oq+4vF?J0X*E334nC2ma2XB-@F<wPVnPnx}Mk&vsN
zNNn*iF;UWGJK5RIHq`<eA=Q{oMn=ZY&MtJYyD4M#rAd&milTaIuZ=+9JG|a@Zb`(K
z`TRsjZ9tX|wjQMSQq2LoAs>EyK01dmDaxCDC``@LE9-0txNuijj|I8X>0i^-3dP05
zz(@iR%6}L?^?TGuhUnN>Nsq1AYWN&Rj}8Btzo)CK{SngZn=+DQkj&^M78SHuRW@Nz
zD)j3p9dYF?#ur{g^zCg_i}&^r^ngnW3Osd94Y_4%#!9AZLKuT7QPz}_Zqw6<Z>Nn)
zI0a#2R{T4VFR$>Qy=EiA=PtNt(^qd-1pAD!khRz|NL>%F1)kUYi-1cWQ$s^6r`~iC
zIuJCjrV?pjFefhdo7+@$eAF2`=rWN_tIE7rJ-wb){Kj=WD;g2eZall4g_RXW?+I<&
z+}x$S;rzW-H;7L?2cIIF;I!}A&gYQa-JZL<m*V2JfHYxeW79D*a<<H|vM=VcCw_ba
z(5B>*%TtR(C^RNdGNOA|)Vk+Eehn_llVb4A;^TkLC<`}Z*XxPiuyE1U^QT>pbZtfP
zRAgizuJ)^b0mx&00XW60Gtjl1A7g9X6}D>-GrIPyDaa<0kM`~7xk4WyWazWH1kO^2
zdei2H4nvbdySWP<>>>-W@ds9XYls*5zsy{6?k~rd_fOhwqW;WQWPvmtVz%bZP1DoM
z_A8j&dUu3fU%VHEWMe{A)pyH+)w|f(;h`Y~5|WR%*A;oc(k?Eta&ue0Yd-m2A!U7L
ze_+-!%b@bKfR6Io^2BDY&A-LOnCcnDh;@J0;j)CFXYoRN&kEoN3$GML4Vt7kmTWu%
zs`mM5<QY%qitT<pP=CXNt1^62#fXZ4Q(z?a+TKgFqY|jz<!&Y4lfMQ6S#JfxK@#Ub
z<_fwuowx7<EwEkL@V#7(#KXl-U*H7FXG|Ss0B!_^NC(^*8W==$uK_0@BL_zm8k&rc
z7%%K(j{_~;IW~~hEvRaJef{Lb9VwWei))cah9ouMJK!}yx-6mSMiIePvU*IPo+WJ5
z0FXXBAlrDtb*<Gm0?-6+f4bx1N(Ai{6vQ4Zz8n}(DJ{Ef_c<pBaTRfqhV4^>#56*>
zY}oX)x0@Rn-y%f=Bqcebj!3QdbmxMRZ{ID}KSx-RXJDAquexXrSXS24BbFNw7RD~t
zy_qTNCoE`gb=+ydL_Vt5J<TjT-u~(5R@vtreZj)t9Jet(PpH1zbDzysSEb+@@<<RS
z+w+~xc2@V5S3kb>MCt&!vg2gR`Z^Oa2jf>&kNy%IgY|0Bfp&(GCgWDHkFi5|<C}QB
zQtjfV{R4}|>hiQb!`_rdd;^E5gb8eNtMp_sy|%UCVPS0&G?|^2BN<}E(!bSNJpghQ
zA0JO(sP&>2E`19>zpd0>w{kp)xY*UtCW+`~NF0Pnnp^S<jas=z*VcLfGA>DN8CZ@$
z$nZjoG~azT75Y&O*a>X)^lC-X&zd<NEB61f(*Mge2y^W(=g7_C)6&dNQtFxAYnd%?
zZK{qdCwv*tY&rZZ**(0!6Gt8J?OS()^QoJ+jG)KX)iZ0s?UA&|{^9FeI?)d&#@bUx
z5E(X2&zT7d1Q(i+X@s}^y;6YvKdrU1V_}hCima|UkcT8H$m4;dRO00@>;4)d86{s{
zQO-nw$w<`IG04qoVi?wCb((@?T`OwCK$J2XPQVF<!XLebJ@$2;SCy~A2EBxIMVVPy
zu)fQR!z-+6#e#kjWiltnn{lx)?Yi}m;&pPNerAea#yi0jsh<qI=N8~Ca}C@iYkHR{
z`ZqA7_1S+<8)dEk-^XsjX}Jp=hP8I%WMpJ}`}<&PZv&zLi{3T&Ca^^SX+luH@5Nua
z&MP17oAP^dH;k$;eTEKs`|XzeC}8co9Qe%C)YPP=rUqQmV96WhKu)#c;NSqJh}!CE
zAfAGsoH!gAcb)}0&_Jb#q@7jS<#(2p1->f3Cw!|lL;*RSQHM)=^Nb7*N^o(R^09_l
z+x(<RU~^h-zB>7{1Lz(Be)+lb<GA<$kjOOOX=QG@1R%ZqZ0RH_O0njq(JW!-kwi8Q
zG&CyESuF{HYpV^RA|!F>=o3YSy-0a4Cmv~ezEPrcZ9NHEP|t3>E}Fd%Q+SW2=IG*a
zh5Dz(0Sp8u=$d13^;Oxc#i>4tb5`C>wRiG>r$akDrv;a>Z@W%c$rr1Bz)k$~Qq;!3
zHqHqpd6B&s=`*61H74wLk--T>%Bis8gS53&9F-D#>3IoKJu#=ZPS^6y#7EA4>HSS~
zcr;{qjWF^G*fAhu==+?30|T{`J3IcP3VL=3bi<E`z(K+LKjK{d&4@jBH1uz;Q3==Q
z57!UOXw;p7?mV!>&(BZ4+z@efT`m+&Nm&`RIiT(Uoc9`3I83kt$Nfx|=cmj}T+vp?
zxk?b5$p1T|P`O)k`Q;ZfQd_r16q>`?BBigTAwT+90sY7)D2Vcj7I5sAk{O2P=3s3<
zFE8iD#f?)<WYyEvrIC0at5z5?SqM06TU&h#3Z&p_ki-pw7jj#G^aJqfXt5C}40Xb0
z5i~m3)1nZU7n~<P<|D7&vkfokR<3zbU05S@)V7I^G16lOu!*Ct>ix)oeXd72fym#V
zH$(J?mOPTQdn2GHEjK{H<O{XvgQX^2J+JvRflf@!l81-I)%KU!V!xJq-+WA(*&G;P
zr_0VcIoWgD@T_+>AtF6zmXusrU5!&D0{G_GXQo`Xmp(v0$>nOM35Mq7=JTssyAm7z
zx4-!vxwwesApcQD0+Wn30IPrn({*{bXU72Xvi^Bi9G;B(@Z;Dl+*)hy<%)T;Z`)-}
zXiUR=Lb?^qzFNPs<?px9m@1lPFEQ9c8aZGxnk2}tk6I5^%L)$+JuM$cHG(A+vQ}Su
z{oV#lhMrdvi1X%VmNyUdY%2cBesf0@3D1-d3*#WauQZp2bYEWBzBN+C+s=m1Cjevm
zznc;GX6lT6){Ms0>?f+z&j{M3+YEfk(REC~wP?X+Y3b$V1@I6AYgN@jpyT1-C;|Ay
zQvld?Q1DSqTmn!vk?qvC)}lS2qO`ObKz|5F%DYDumzm%B4Iwn~LH%xxlDBTnR2f~r
zJ+{h?>xPz>&4648Vgme?KY#ucSh%^l0lpcesso%IgwkCCSP#7Btwh+^`bS4cXJ$wV
z%k6>Y2RMsg!O_YMYiuwPpkZMse2*ZNI#>6=%<8fD8|n(qA#qtdIW(Je&PeHxnhwSv
zoH32smC!d~3w+1Cu*^wzNT>=1()Fyr(P~T(eOHJx#&@hSQlj$1S6mnj*nUYKLGMsB
zCyhcBa5^n&=n1pvr5QzaWng4eVJ0fXJ;4}ffl<wJnLg)J6O)vo!QScri`n;4tJ}ub
z7>ir=7L01wd@<oI4_2Tqa(U_h<!#IBnz~aPw75UwD<12n`MYC@(zaytIPOay-rQ%x
zC>);ppXU}zk10zR1WMj7dAEdGD5=(2OeZf4{r^DwfGl=IR06Wx0;yro+W9v7+U1@P
z8#}tsedi`zi2i5w7J|5dSXID}n70D>Dkdg|oSb}kU?Awv+>|H3fIur0`cKwG=CJY1
zh?{IPFDEDH@rS^6SXh|Ocg+Lv!7rY-2AEqegpdL-!IUq>YHo4y7FcJ&^9njM*(f@l
zv&<wMY#{nbbWp;7vg%Qv25&b$e*@@j#VOFeRuTV{fs>lIygvXe;3R=~<~s)mV?d8m
zQextg(}a_409H1r2e(t=>CvzKrF9WhmKJ$y!CS9Y{%%8UT}5+@qefRbLx^ZjU3hpe
z$Z^m4yaZBlz6m)7t$=f|3cR__RiM5AK=eRSker_068IPZs4K9lGOPw&g7qJ$%|Ic*
z&Drk8{{A8;<EW;Don2Jc25M12n4VqN<#*MDg+~ffNlZR}n`&wjQ0?!353C~Kh6nP{
z29~jGRBPEbi-;XK!5Kz8NP_jF{*lX6=CgK85b62V<k!c&X;p27Zrih+<IYaG1GD6I
zIbbk;^L7TygBLGGz!ef$M;O|tIY*pCz;pMi##gH&o!m%YlX$CTaaLC+IB)cz-&jmz
zMD><WdDzP!^|r#}a-AvLF;-iTc4~28p}MMc{o}4g*^s0EJBWVDT(nwrCEw}S<die<
zhUU^({1y0JV)&ryrG<0)5&9>WT6g~R&b0^X<$oq!-2dRS4scq2n@A}14)hkVsi$`Q
zaR(YXK)3-lknuN7;2F%#wE|kbxk`=loYVDQY;^R9f`WqQ>)-&40M7Xe>Wr%<dkEk<
zpfCd4`BJkNHV+vRF)=ZnBm`j^cUIW2&pFks<po_w%hx{Apv$`Bz?niXPcw+gZi>EI
z=LR0Nr@@2XKJWk)f=W<++YyTY3iTH(sHp+n#>n3qJ33_N!^!@FGX?hiyu3WzPU<*6
zk?Vt=C*S+=_1bEN@YpDPUleN7N&A7Ol;7YGgK`tHvG5CAstoY8&x*KFQc@D6rlha|
z%mw#rWhGBgYh>hr0qg((Iv`u1Jup!4H+VmxqN3ivX{UHk0-R+%WYd%%?NqUd%k2Md
z>)#74kU;TiM5~>C*IcwIs%O6e6}#ZXT=4X+=JbKUg8)gyZL4jKid<*0_HV-*Tec);
zulAY;$4uU;WXmlKnLP2kG+h>brSmpz63uyoVQd92VtG7k*^7hy4=l90pWKh>y}6r#
z?qTxrU1rU5i=)B_&S3{Xrq)s}hhAnSvr#Egnyh{f0|qVruI{y{tE+(m|6%;Hef+H2
zvFPr#A@4*|qFm3f7u#$AJ^Xh)7hB-2fHhr2Sa_x84K|-Z2itr&u(@}C#6>11Ae7}m
z;B-<45pV!?kF+#j2PVY0*jW01@-tA-3Q)0Ncw?ielvFD?27q3X)&9q|3OSM)7FxWy
z1VT&jlP#J(-vUPifb4b*HG-@#$jQh)>NI<8NFV~myIHlR+`D%?1qe++nc~3RfEBL&
z>Xj=H9)U0KUo{3H&a)S4Y9XPafFS|@j2nrMiX3{bOzsP6pMYoqJn)x5ua1a_=(=Vv
zO3`A8tMCjUFf%J6(5L@lXw7(I=K9F!6`4D0+KZs}T-qHSw{w`r6qdUnWkT`!bF~*Q
z6uP@C`2bxFoF25`91`YC=KtE29b+ci1>^vrh)|vfv0_WzU&M$cdS_C~C|<9Zrr9tu
zaBzSiVZ?i-so7aFs%!vlfsckx;ypJ2_>E!B&EiyX?~m7&a&V>*DEMtKI)tmm1$|t}
zHfzB2LdvC2j|2zRBj?}%A~r3yWIcFU0Z6Bl`rueo6VJ9*Oy)#RDfdnU_?`anr&%Q8
zq`snQgOFhTP+HzPXC#0#6a8383ZW<0M{kmGW&+pWlP9;&tVcpa#~Zwj9?*ou`l*Ae
ze^%eY7IchCnuz#cZ`Jk>c_m66dadj)?N$1V)2+A)pDN8>yx2EYm)dvOw`m!e*4MJD
z{AMevgl>30UvwHZSrw)@@GXBP<!_d+<@sPu9@h9TuKm&y8-}U-;e<;ux_VP#!u5P>
zsrOG$0^f@?IUbF1v!<&vG#^^bTeOPYt{Q6LARqaFo%~fh;y-H??f~u<6a6v3I?10J
zqkvpGD?|dkwdCYvOHZW3kV1?iJ0QRXj+?r=I>g#FYqaN^;4uadWqgQ;rI;dG2AJkX
zfPJ!U6whg*qoV_GK#fgJhzts|)FSUt)={f@d3ewih~q;BsmaK;U@(L&UOveJwChTf
zhC={$gHKvSNC>qabVTrHv;bKfkO%{XF}n@ezZk-S`(b|GkbTXYx4yg_XT*q*@H1d+
zLhhq4AfaFj5DB=u0<SDfSlsTVy@mVWP=C1Ps8r^{ONDx&K288B=}w%<r6Vgo1rQ-F
zp{%>Pc@{9@f*k=d6uebDfcX=gUAY((92{H>#ZUzx<D?dI-!y>XX7~Zd$<n3T9lSkr
z<*>1^)HF20i^}Wkfz*eP`}+R&k{AQU{xiLDhp&o`jt+4(jU~{af!q_&<s?T8s`75E
zx<3Nq39(sW1OU=(Y~R6OwZYr@9{441=R&+g#a90Eo;FZpja%yW_qP*~6%j<CpoL;)
zUoa@3VX(T-#mHFQ?#=A;TC^R*4VhBA;vD}24r|`?9Hq<3J3GJix$)f`=JVn=QqgdR
zJ@#Mkvn-a)5je$rn_!|IwUCkrdwr0(dWP|`2boMi#i0B0Be!~p9j<bR$2?8s%RB-?
zo_Njtrr1?MtFifK`aU*TX|i1&siyW~DmMHT@qIQmy(~(NG_>&vTA~jpmLWz&t~_+5
z*TaEKJNbA=cv*@tx5gbotTq#&2YuhUNf*M;>f8ey8Tq@>{{=WMc2E<JSfi93gQGGv
zH3c99Dk>_F!&Lj@N3epWC=w*JDGlrl$#fmUdz0wZ|8|L;Z{kePV*&6R<T`)F<CX7o
zvWZ!MxeGhK0`*$^`}@c)$e7)^Z-$c*=T}?2c_k(Nz}E-o8A*Kxh#Imd-D3Yn7jrN-
z;4Qjh{G9{DByw6=NeS3ky7gp53((f$cp=z*Jf_C7vIqe2TX+GWv1`zeNk+g=&m+h&
zfrW{UjqC)Zw*a^XR|dZHFINoM9k?eg;kAa`qP@$@PlB$w;5=Mh68<-5q3HM>AxO4a
zQoh2URl){>exc*huyb<@^6?=Apuwg6uJ}5Hdq7q;U=BVte*exU7zHj)Ct>Ougt2UK
zF9&d&1o`OelZAFx*VZa1vsvBTgFUr@mDCw5Gghuyh8*Deq!RiwlW}$TVJ|~09id6a
z?+OUkTv!Ut0|Lr6H#gfA`!HS2=6m0q-rtUYNpxXS3GaxaQYJv;bxrFdb*e@U{LCBZ
zhU?C}+V;fyZBue*PB*rC;S|sce@XOEa4oi8%C9Hbnt?v8TkW6o3V&g9EGC?I$^`#t
zN022?lN|y%ciW?LltdiQ;Wv1>{Q9-P(CZS)isXmidY#*~SBkiH$JfolI-%prZjIwv
zSZzJZ4<;0$Y_QXd%A8de#;8x(IzKIY{^QMmSG)=|4<M%E?=nEgz?Kgjb91w^Xyr-<
zRj5qk6B7hZ$~kX<r-V;7*o;9ER5Nk6gG~=fMZwSV1gQJN!o!8WFj-!VqzNESb{b2$
zt%qf2Tgb~JrKx{Za9szQP<jr8x4@I|=FJ}lg^xM4(H%Qsc_EmK6(%Cpa%t+U6DOdL
zrb8acRzcs)=cJ@?d&^WnRoH=dX&eaxaHbyc;6NW=19Q;7t+!oG;{xImpw(sLy8iTb
z-x9WrCDfEbkur=Nvw61b`0m{UuQ|%#pek_`3Qr0#kvp4|l(L-M*vg8jVnJ>C;w(Bo
zQI>RTr%iyk|IN+jVE*PG($Z2jdHKVwZSsUCWffIbi5VHjuU;)xr28o<0_6mA274+8
zx?wzWXy|k6SL_9U|Av5qFK!dt7OQS~xz7*(jh|lj_i@~70l)P>2pTSZ*yi-$=Lp!k
zI0B5H(Q1TdF%~E*VEzGCfB|y&$B0B(`dDfa9%g0~3hFMzUVs{b(Jux(A0T-GbOHW*
zhNPw2K;@Op&@*A-_aI;bv@V^)GE-9{Z?qC3BMpDrYk_0Rybi8xN)`-epe}|<*-y}e
zy9`)ErtI%Z*`Gc=n5&}WlX)fhRR_Ft2}Py$mRkVf0CHO%tifyE0?2D034!XN<jv2o
zSX7~*>N^|^T<Z3d>!|+#7^oLiiN_Sf!|Qx-pb-F;xRW$bL{{7GZ3c;kCA_m3{mA))
z_h%2Hqg4IwL|OSJ=6ZW?yCbNXiQJ}`4Jau~wDJX^(D>CRVgg#)jolADLqmXoc3Z3`
zMnrT>=E-C-jKh&@p&(TB8~S<;2x&Dn`L$Dv<2!Bw5N^XU{UNyQNFFcd*Ph6|N1$lC
z1hk1GCv%aW=d%4f`S|6Zk5xx!Tkeg}*=zZopmkncV|~)brzprS04|AE%*8oe_J6f%
z&Mkbqm=B@J>a>km_9dYr69%3P09TVjz<s5xtUNq82+kZXCBF?&B7hfaWXI?gxN(9i
zfyHp`D^RGuBz0{|Ntpv)oc)MzVE$?bFU~?`C8gX4hyXu7kSbAEz9b_iwsv;T%+AIi
z_j7P44G!)Epcz8$+NYQpY(7av419caq(A_p0O(PaX6$@_ZjGHD@awV?9j7NJv2k&^
z#Vjl=AmV5>L?i>%Nyf4R!Bf{x0nEoGfT#Z7uSn)U7A#tbPDm(|e);_4&H3Ye3wBU(
zO&cz3d*$s%9xY5QzE^+oNS?j=TPHDFN7~ZTGCS{OVCFd5tNE9#=ea+r`lmyyn=#+9
z5e!irhP;iA$IHWQ@9nWTf!-xM%_lWAHDE$@_1OREebhCU+0y2FSsrj7%=WTU>cdH2
z;G<cFxYxu&o$qWp<!V9H+)8T%8ooLG=OrBM82f4afn0^*?rxc}^vmr`2?-h+wcBgl
zD&xAlsY0C@oj^y|D$}Or@8jjz<ifrXU0D^|9kvJbP?%Y&pg*)K26^*@<`xo&&*`O}
z@`A)qCu<8T8a8Zm@->X4=hO3TQ7`PLC`dXCdXmjr)lGjBtUmioDl5Q0!ZMR&?S~3t
zxD8cGjyz;bB<y=QRGAJmS81TnGF05qXvIC*<LkiM1RxN9poDT=S_UJDW-Wx$?g%We
zJ`e~@mWL<`>@&5VyTIJoy!JD8wc#7<>Qm#NvU{{&z({_2d@T0<uOX1t@Q+2Fx`IN<
z{#eRTY*X+9X{G@t8lT^28^6^4_&`lZw|xil*f!t^#1bn-;O7CkeqUNUcz)a3+Oo2;
z!0b}n(6CzPNIrIVWiJ}x_)+1Xz^%#s&(?T$LN>u2yKD38nbtTb(7I_Css)0k?Ck7p
zTU-<uc&Hqlr;8p!PD}s|tcN*Y!aDQoo%>DO?(BM_X>k(u>m0_DxbEqse1tr<SnpzQ
zu7$+iUZczBmFO6_%vZ0jwu2S?@|#c@D7b~L2Cg<+_eb4cUak{+{T0nsBL)AFEf8;o
zyF_=YdJV>^Q3}=rlw1@AnSB{rl%uY%+r?W{++?G#YvIdkV!vZYt+=K`%ht|GG}hjD
z^<%jcD}%cW=jAUFYzrgUbOF79`v%jiZ}H0ov<V)T-Ot`OJ57{ll|*f}j;__vtp>Ec
zzS^1}fQ+i5YaXrmoBBR~OA{E_Mi}dmg&5+kPx4Z<TR~{Po6#(HJ2XhfHR^TI@sD|z
zlH<PLh61lCFRqTQp{C#?1DyCq`u2EsKOs3U4z1+;@Gy3XHF$Al&Q+Lr@6SHX2y-Ku
zo2HHk53c}CpukwSwY3FM>ss5<oAWsfAjSr|kw}9J!2j&4f|RmGAb13(Av`*1n$^4Q
z>_AsFF!E+fesBl>=kYi2Mbe78S{^Pn`AN$Fi43_dXgw4?X!pM*vwNG7=w>^szH++O
zjqY~`kPZ_=Ls<GZ4rjN|+B!NwJj)Lp9i}*$5!|0u1ck0e0~}|;J%Dx;Lgz$?y%HDS
zh8LY=A97hz558K)GP+*PX39a^Uzg3Ya2|*M{>GzWtuCuSqKJh$7g?pHXMg{8*`M77
z_leJ)C};rDBIkW44uMaBeq3bahS}oD+lEzfjZE>ws`mG|PntmhW1+cuowM^(LvB#%
z(gJki0gn&wGE5fgsE>Aciz<)FN?vv@x8B-6K6tFO%lz5YHI^bWaYOEG;rqh|sckC1
zD&90m=(<7*2><rSH6VcSZm7bTl2{;d2j6WFbMTL(=}Oy`rNPkQatywETDjW28iYJL
zulMbjluL`jO{`0Y%2}3jCUa#*tf((a+EGh&HN!pir;sA(v15Q&=_{&dk;5A(76Cus
zFV(v_1`0h}Tgww|6gM<XCriL`!B|P7huDROUW$4b{XeGOG9aohY!^nQL_!H^kWT4N
zLAtv`y1Rx(Qo3s>=?3Wr5$Wz828NLCuCsaG_nhzh#Shr*Tx;D|+}BbRcX{m25FJ!0
zT1~%sjdTpm({t5^FB|SMevHgJI|>dB@&F20pxq{6I!Rv}0TFTIN1M$&(<ri?@KiV!
z%?oRBz2*-S$hl8pOFsa*|AEHJF4tWZ;602g1|#+Uz)vnK%wJSC#QlnkWU6z@V!n5&
z_#-H=wRyhrx%GQ0$0yKp{r|j^$*mvt+mZ%#H@1aZA7;u$?Y!T>1N`VSu<Occf2@VA
zo!yCQlh<VokTZRI$AygIrBj{)?6pS-biKK4k5o=fv@JJQJ|8yq7AscUOcj`ip~Fl}
z5`BF3iV6xwvmf^d6S?Ormw|G?p9tOm1j^fFFWBj3)7H>XX#Q}1w#DBJ^gMgmN4DBD
zIkj_ypagmt4Y7k@0z@Qtev7}N(V3_=$Tx^qWzh^5#=AyWbcAO68A&<X=AF?~+1)SW
zSU;OD@hySVGXy#x+wCLwrW<3V6|tD{en}Kot)~ocmdA{yFS;ydOvEbKQ+ZOiUj^w@
z5ws!DoC{&KU0fUw(Bk{#{gjQ>h*?=FOV#|Fk#;sa)JXWx#F$BmKBD(M;`?^?FDGMr
z-v6bq0fT4bceqJ$a<Y2%GLZaP03b-7IWFrN+VN;$zvOZAI@x6UOIS|R#Do%H>|tlG
z6KKi=C<efB?Dzm|N<{+M>c_pjZ+W8QR2F^mTgv#gm&)?E4f`&7PATF;!<QwlOsqFz
zhj8$k=vCmgA_F@}@u<fPm@sRr5LS=*8$n3Nso_(+-OA=feBxt_)rM2EQMx~n#tl!3
z^73Ih0-m$=R+Zx>gA+jOL~v*)o${YXw|y_7m_cBjo}7FeO)wijEdJ`1CW9uEZi{o_
z-@na3o@s|Su6kD|P@ZxEQF5Sru3F{%J?!4-h&$KEyRX7seSocG6bM{x-90)|>-0%C
z<|@)bM23XWT|?z1g!3ma?1zKR11pPD9P}Iap?!(V_7$MV&BFr-17UqBzR&LI)YoY}
zK|P84k&4BNp#n6DDv}P)%Ka&MTwLMitBcLC->Vp0LT&fXPXU~SW1m-=w*Nhogu>v*
zSWxb#8clq6(sa|u*4TMJ^P=TMj2eXq`gYCv+pKCd91v9~90GHKv9Q@R?Z*tR6k(CI
z*MT0uwQ2hvLmvTla&yzPP(>T)g*<@8nWX|ATx@K$d<itbru$WxI|K$)FMtt=nwlE$
zZ)qcae*?968FgeP7O_uuQn7(PC*9XBw^pF#-sMMYH35Ob>fGVio%zdTv0l;XS376C
zyzmam?vyTUc^{IuoT(iou-a$q5GcT+n}$hlqEOKjUXZXd(y<;Uv#cP06XUQz|H1gz
z_~9A;T483GKkr`S#?ZL8RS~;}PxhQ{;t6LPgK!Quv&$@Zq!EP+ny$sp();#zvPaz+
z=9AK_gD{^RwKAASWqg0Spr($y!<1V+sr<CdeO0Ub!%+qClaM|kRey2xb;kbuTJinD
z6={#qWfhzCV6IiMhknH4`}tbUy2!Aq7Il}b+tlZgBUx2UmHEu<9B`buu{~Svq24>)
zy4yCILP~2S^uPGJ)bZyC0@-mMJ9-0ARkBw?i>5{<W3-V^4s!qBfPf2N^}6bsue?LM
zF4L{6j7?S3sn||>3|IcqpGPV|ZP#{Rl+dy0fHtIJTD!h-ZPaO<M|YC=txikPpcCSG
zbJMk-+Hf|7)lM<B`GGt=Q)JfeMil)~J$>d!;I&6^(Dmlvlk@E|BYyLQa&PVK&3rAL
z=2*T$mO1cP#Iw6~lM;^B%ws0>7TipA$3v4dS7gG)6C%UxADXr}V#GNzI?n#t7ABlt
zXHDDxR6Lf}KQi><7_!)It$R0i(BiryC-uCmZ^Kc#vArAd95=Gx#K%i!cUam&_tr}E
z9T}vzG3oM8M<OgAi!Q>rM%!o{vdF8I>e$%x_TfSpms?@A1_mMUu||fxZzltNJwb#s
z1wCFpLZ#~rI$}3P*`sqJB9y_41^jBp#WZj8vlGixMJo~Dh2H|Xvnp0LRyAg)BcTmt
z7Hi*jgVGAgYD!hMvZ}WB7T<O4LEpEBulYaw0$jt~+_ntcGOGX?V$b?77%UOk9=Zuw
z1-_;h(hk(cax4B8{PnbO+BJ5MW;@8eMUJRmYUm`<RA{1sR;4xUt9EWxc?ZJCg!sxu
zrfXcKtoFMv)Sa=!?8~Cdz3aGlK6N0>aJN>fijcJx6ODB`)QnjKzfZ2Gb99n122ZH{
zGF|9vv-sNQa8qeeP7tNiyY6J4^;W3)No#S)%#zFZ6xwiP!skvpovqB;yM9fJ48e~Q
z29>K!n~A<kWs2JWj0}MbI4EGdZTo@zP@Vhkb?5J$U(CY(+(v7Kq4tII7iUG5;O_%U
zt~J`WJ5%VQ-?LeJKf+6ncaR1Z4Cx9Eanqi<y`dF7q2>aTQGi5}c#5{6Etw8QWO>-Q
z=C&C*B-kkJK=%x!_c2mIF%`R+vS9wR5PnO6DqXUileb&2X!mi~^KzX<4#3{xwK9r}
z0DXg9(;({hI&UU@0oUojNo#J{Ru+EOdQwfS#3PmR$A+Fhk2f7_+t>KV+~a?f`i<+d
zhEn%fTRSTAKl>#j8v40f_xILs)@yXMFJstyI`@an6x5IPEXzz6@?=VOMlaPszeJm7
z+ud_o%tY7l2wWux@XyCUfK}*HclP4&I2tcvuslm+60Vy+=oHDJ{`1LEV$&qn5;v3T
zpF2Vf1Rw<iE(&=U*75JSv4I|lGZ~*Sdxcnc<9G28hu0F-GCkV?6tS<hn(QC{hCj({
zEfXPXQmp<`FH?&!F7>u#n0*GD?zeQm#qhtlIxOBET_&getR%v*+AEF{k~X?cZd!VD
zpkHC|X}Q)+aBnW8Xgh===NxoC_+UU!^e5T*Ocz68Ai=dXf^2q1K!}Ep$*p~DCFZNn
zGh1FIHWGEZXz6d#SAVx$oF4l4H=d<?*?bc7=rUV5`%PW6h=e5R)Q4&x%*U}DPqo$d
zn(E~knT7m)oY5g{ZYNKq@D3CxkbOnS$uq_8*~!kY#If;qd#TJ;n$WKzz~F(SaK7pu
zR$$4}swqZ;sB9~6BSR1p=1aCSp-{=yHbpEH1kCWL;Vrci=-EyV`)pzanVZ^N6xf{4
z=+7V7)Ibs;foP=!FDS>>&i7+2ZA9hi0-NpgA2!<k$|e9_a+u6JG_C=-UKUr&Vne@E
zb==aM0OiqliPRt@YLd9&O;g7|dP+%m(K&XTE?|v;&7KSe8{uZNSSMnj_+xI&tq<bc
zc2yG0DH3p}125>g|IH{7Y3ToY!t}cP{(8EJ`apng)5Xj=B(9%usm#8sh?60ON$MdK
ziuPp^E3pd1oS^)p%yf`_-Deck4=qMI2fcyszF*$LX)gu*s<_M$K$tc;T=ES=|Lo%U
zt5T~5HI*SMBEsjIsZ8wcJ|MeX=6xGV>eZ}91cIOeJOz5Z>&VaV`10j$B`mBfHYTQ8
zIiEa-IZ0+ZF;VsKu=1;@0g$qp!mx938&$8xSf$f|G`8Fm>elACe)Y>}_-JjdP{{cd
zf0GC(wse}@A8+11?HFIvZ_0Ihkt^>#zw5Z3f=rG<CN;EC5k>Dw7KpBjxpNTbg9wiE
zumDNrHBP#uSaJfBTP>%9YTd@)EBPV51)tem$CGTe@4K3V<oavDSBw!d2`rTgy};}J
z{lnIWK=|>9hD6@bc1LGb;p7krUKj(IA4E=`+;7wR2@NU0Btj2s%}(Wp9VjfUHDXdN
z{?CKE)7*f^g^ivb2(P_TExJ+6XgnE8{>kt0`goOptp*khv_bRoryA5-?$?-TI!(ny
z%ky~HPcJn*A1%6+i3C|YtVyhN++W<Ba@sB5m1Go61Ef=f^(FAMfB*j8F_!g?NB{(h
zCiDj~bHAJ*BIM(@|DGs>1Il#VLTFZvM3vgcbA=py2Uv~pK(*M|>fVPWV*6bi&wvik
zyFwXXlSri`sVcA^GeRKu023vbpt=ZRas5VZ*^+4MeRzcz9{cS!4OkxGal-)10GtQ>
z)t6p2P&tZ+dAkq1L)7(gS3k9!7A}F;z|asiC+B2$?q!}h#c7O)h^CKED3B!Qe}9FQ
z%8Zo7*D$v1Ee6;jXcS3s@IW<Y!|}?>JF-cR?v9qtR?{;Vu7fA*vg0{xGbPGdMe;}h
zmUp{7+eNn8=Ht?Cv^~Gr?V2x9e(DZ-zTDetb6DdG7V9m|;0Ew;z276E|FhGRP1uIk
z&%2sqav4F<p9-99=D~lA*?#J1Fn-L!PQ4qS3VPojX?h}2a2_l(STaSa;_YHo?3G;X
zYQJr3lfIKHCoeBfuAH5L01xjP2u17M^gQub%<sL)n*ZB-pwsUh%7i{pzpZ@mS2u^p
zcF*3Ti2(S2AA$FelL+<Y)|uH|L1G}u`lN1f;&NHPj=Ry_B?MXq{Py*p=O1vHY^yXX
z!CIMG<r-_3`!w)KU3$$Yia>3Rh*m}<*ot7N2=Fxbp=i$}ytU55DLpkc(y7c2tL?WF
zhW^B8Zx<tT9u%^u)vEODPdDiHprcPGWPqq>n895wXe>5AUF>kM;AGq%;|b94^#7FX
z`jCM^F$MIjFrmi{<9J@2GO6)w!lnn!P}ppjch096`8esKmDxUWZTA`-9aSvoUG@JO
z5X#O665wZ{S(h~fNhpN5((|F&Jv#HlzW+EQn29zSkJZNH626iiW4BnJBvY20A}(_B
z5`4s_>GL<#<wsGhcj9z)`a+Ei5notDV&?n2hSuGdrm9HuF}h&n@MvqFAA1t&%Sm3T
z{Ut46{9`0ex8U$R{fs^PEh^;xia3Ku8LQo{r>BQO?Vt13P);<#EhSY)sYc}s<pbmh
zoE&#q09}n-&YKAV0kZ}MLc=MQSwfH3hW>{$Me_Mf`b*@dwMISEnI*?-Yx2{s_g7Dx
zHX2}Ie>PD%<Wr9$fp*v3tBDB;#uULZy6kLu-NuS8^U=)l%k|z@FPzG_@~9J~d7bu+
zePO0;-^Q<G6e{!)kf6LON(koKFbn<1ND1u1o6Pl7eUK~)4H^+=5^Cfqx!cNKggtt!
zi(?OKxR#KmdF+hX*%8(GW=)B<U`jHhrt`+qnpmkZP^k-J1B41$G1Sh4o`=i#XPBxV
zp%vjoOr%K96Y>BZD1W?RjXYd#02XDv>?n1<#49f#aT<&@1SEUxlIlfQ9>EU13GP5|
z1pgQE8sU<3?lrqw?7w`34QPrRTteZBDXcq7$+t`6&6DuQHt~5yl0A&@2c|XT1sUFG
zUti$xo4zRR4<FrE`VbAp>Dt`0Fo}t|&4zv_(o}~8w<Du037ZY={r>GuM#la4D2W6Q
zwK)(jQ1RTKqwv18p3V)L?+IgGZaC-jd)O<I4+Hjv`E6X|cWV}*#mMXB%9%PW{sV{P
zsZk_|4A|5Fpt(nH&pob=1YLjzR%-b@GD`M~>E|ARMv=RH3$bNlAmiJe&3Eh!Fzl4z
z&CD@`_^Xdi&;HFMCH#;>l(DC?c)L0r>t0y7CKwB>ZOQw|7A{WYKYPgsBkzrP%G=Z#
zOw1hcKahk7^XHk?$dNPbuOF=0XNSLaSXs(dI4V4mjG%!UEcn~?4vc>G;>hms^|4lg
zjM_wE0wvxBqQk@X-zNH$#wJc&djF$rWX6a~A*S8SVjcw{rj`>E>C_<^?euaQ5*w(&
zl!nC{yjdb_Jow|BkMPIj2WN@D8f6>rH2|`-qEIH84h50H=+E|4y?bZ&!*{6sCNrom
zD3HVs*g<I&d<rCk+%#%s3OPU;p-?4n?|rjjiMHluVqRwf%FV5fUe)|O2s&}AC9pW?
zx7K&L?O%a8y@mXutmdl*E404?4s<^6tL;4Td>*3pLRs!cyL*S8&_Q>1uD!Q?JODzr
zUAb`pfeM0x4i{=xXAeh=_*j*`cp^HW$DVUtJl1~5Aj-5eA;-)bP9@^%S_5^pH29Xq
z=<$IRj&bxzyhqB&Oms(TU21!j9~LW9-E`D4TFunFz^3az8#dDl(Dho_zRgt_46g<I
zIl40pa;`TLE?AE)e9q&L?vmDTjomLP#fZIo&)zgI)3jIR;-L@#t$fwZ|HhAb5J#qv
z)rYjkich$dALg{~V3Dq8L=S!3Q*AP=+ryDf9cac4<fdgoh6n>Sw)nypP4zP4(&V4%
z!+{w;0rcgG-dBA{f!wFOpf0so(rj7ZcSvNNUiIvwT#T3Uqk&pkakqPB@m?|ZRbJ7e
z9ltf0r}tWI8t-^^_kzdo+f_=1&<i12KkYs3tAvUwc%myp7gOj|EBzj~Uw{{YN9$3F
zCoAlS;mrd-7pFYd_+00r(KJD{K3v@#W(qM7G*r4UpR6YYO%G`};UBWswQ6qBb;Y(5
zIobfwB!h{`)A4Y3@DSHsa0$=rc=q9`oGVke*<p0@SrFCG%{E1JkcVe6#!j$*Wytj@
zd(djo3Y`1?^WjK;Q#wOEOL~_#CcFb0GNd=S=+v`JYtPnZSGUfm@2|=Gc9HKuIbroe
zFFT&tw}HY^B^R-#ksQfDLDfjz855{aKIhwOGE_Z2TX<?b?f-mHI1kY|JOGpy7oSa|
zdwj$p?`y{zfoNj`gm{Gk-I{)DogJY}vdVGZCq>$>DspF9YrG_G!<r5UPF|Wn6}hXA
zOqM1&1e=b_Z2FJKme!A_8<d@WmVgh%@6oq4*6x}0K=qhDKZOYF?loVS7VA`b4?;TX
z+S}GP-@ZC!Z)291meP0e^0PJuho=t)H@bYb9d+zeke-QA`esf?1YnSrCG0?tJ}=;I
zqun=HbAMKAj=mb*S95l`WM()9TunZLz&B8dgJDu+@Ep8A`wzEi=}HtJ{IG$g1WG6Z
zKqnwU4d86Q3XA=m1$w9e9=?B~kQU_(6_S_a`po4LY);)3by2=ll8~rNS0m+oJ5c#G
zLqX-o#)%d~%$No}p8p!}A>!Rd_RQ8(vSfa{MHPP=^x#OgX*}d2=T5evPkl&}IWo@L
zplf#Ut(A@9C~!+yI>@I?SA`Wj#~DPV$t5I?Xo1}D4tZ%v4h7jJiu9TVg%R|rE`E(J
zx4~0y1sQaJ%FFN_z;@a64+r;>vHF08CWcyS)r}Oo-=Lxga^Fb2Z+|Nv?IHFTKSZ?j
z`mIroB}r_ClWM7{DO<}}3Xi^2AzO={l{q!7qc5ira_Vq*u?e08eKO&2xvG|}EhD|!
zzs;@LyEXr7{Qp3pZ@cH~qo)Q*#e?Pv`)>@Lbee@?>U)}LlXkl$IBx{4+hQ$+>g<G%
zpQc-*tyrg9gUuA5W7<-Qm6a_4GG+XzUGHvL;+QFVNW(4s5pT^`u1(#fu}_l94n;3B
zA&r^?hY`Q=^yoG&oG^6$w6y4!2MQG&vIuR7UYrV=CIi_3v$(xfbd%n?sl{xEQv;ch
zn|`MLgFclK3=IYDh4hmzl1rjL*~;t)=rQx{Es=%pAOUZh0vYmmM+h?Rf~OY^1cxD`
zw$osrE!w7tk+XAiTdg!xj}7camyWlcTptT#6FnHumfhY_!{wh%>Jl1;*8-NNlZ{?N
zn&Jc?xk&B$QG?(}`}x#XPtTge68gsXKSG7j5dTCDWZ`bu4*C}wUix#{a%$}1swIxZ
zgdd>bAYBiecj5e9qsROZhwN~=SmPSK71?XNc}P|L=h3{t*Q8l1_-SCqv%}SjN2mr*
zM=^ELBLEyJ^=0b*DLUjo>zp`LzF^~9mOZDk7M~-3UBy%R8^1R`K=UEHk}xp!eLY2m
zfXU`mdR1r$BdgHLqAqPVO>IJc#&X{PG%Q+-ks1eDODm}5_CY@K*5BK+Srn+*-*jgT
z*r>T&!TNLaB`T#9KCVlx{QO?r)^=WNW@2l1I77%V`=)Fiq5J0a9U$8Qa+7v&!i=;o
zJ0fA_lVT}}hWx3gNs5<toq>zQ*^c*0GY$~_Xa{Lm0J^10RjyvWVwll=nmOOcrpKFY
zXR!*rZn${f%%NEuQ%LL>lzPLVGKBwt^u5PScG`^Rvx_HDW(pQerLR71(JakjKK^ja
zxkZjzb13;{HR9NKaueeRit>~zo#OA_us@}`0Bq)L@o5z88!)Ynip5?F4r@w@C=B&8
z_T%76CI0Ynv3L&;8Dc@&dC+0rl?Lx%((%o#PU|*DJTz)fAr#$+h;j6eqjIvzYSdYK
z`vO3lD5!x=-My5B3J|v<Gxa8t=<+T9q99VZ=f!XZnP^4%VmcK3ebS|z4>>(8AMea>
zNMFe_w2|Z>cDLQ|-IHCe++)YT9xXA_?8^-5+%`c39&Nv?@eMByi2xpp-vc<686yL%
zv(CbpDE)M|-&L&G<0MML-`LjD0-Gu20U-f`qz-wwR@c*)=O;fZS-!r%V)ko(jO65f
z^G)A#42PGSjsdcH>z}3n&55|K?l7x~7XVYs^Et1V8u){dn2}LeREnZ9q_8zRys8Uj
zB|Q$8(8<DcbDl^MUw~?tUq@HhbCjJ)GVew1fSAPX*TuC5Y;gwc2>%q~vdL!WjyX;<
z%q$G#LVOOE;-Io;&X_KiHJr#^nJD=sJ{th<&>Ra@^7?%(Q=2ekKi(P-5?3|K*~szx
zG05(1W8KoL(vlc0-XxJtp~LN3+c2?+h;)3qHQTLQ#SEIJ)y}i_rUvGtfvUT8&DgwN
zP;PqN?$(tVnttX6Ew%WJ(cl>L*N>$w8TrG_L(TF(7*fW@>7rF}85x`IXd^5mOIX}<
znav8S9Hoh2%<H0oT@5Fxny#YYCX2=CI>a7V<KDRgZ{^6qywkRIiQ}_qz#3w0nEw`{
zNj}y7>^-_wXJ#z!IF8bA;Qt^1dGLQ0<TCoRv@?#OtCIi)#OMo)@jo;Q2i|uV(HNx0
z_A3j3Z>X$XGst1JwcZI9fy3b50dmr!R=v54Db=nEZ1Ok+AagcP4HpZGWna`95V54y
zP`3Ox+(xsK;P_ZrOt39&aH=o^*pkZm{>sWNq{S@=v2zqWtJCV*+8Mepj`M;Z0Y4j%
zV?of0{Rs=G?x5r69+J;)D=KNQG-<GoeP(NA$U|=B-qmH<mbF8nhb&S1<7zqtXXLM!
zO}iDna$mhL8Yg58$#W>Zx~%jjTxsU;--T&9reXIYik91fTwJ`~Fb**`PGZuW$!O>O
z`iGm2f^XPvtC{{b+D)mNm&ZE84|}RB!G|=h+@*7soisVaWYyk{q^$T7mzv)uLZ`V;
z*WSY(Le6)3^DNU<l)Ku8kGtHVPCNh{Wr^y&&ff|9J76}FRdsYJljbwq2@v?Cec_S@
zvGvd?X*ToXCKCi%12Pf;|L%-pbjZHwR}5|KMZg2tDH_5QhEZ)hU9<{FUVC$g9w*SV
zceqDKYyL-oUU|Ii0kohk&X&zCbow5ru?~ypwRLyLF4yC`Z2Y~Om^h)5b%#efZmh3A
zoG=pG8nTy7RZy3-0qluQq3HS#^M-ZwnoIy{7zlXf(jRkpc?&THWDTRK_L|tvRi;!1
zbz$c~zKAHvA6+Ln&Vm%!6O7>gouPu8;qvB-+;1*gIgs9{vuyP4HsqVfi^{kBiaIEl
z4>ys*CMbm}<f_dJ+K*E|&wk9I5!o?1^CnHqaOfvpe+dH2vsnP|(P%-M=Uugwk;KF~
zWq{$62{)6A66R1k#ly52f%$+?EE@Hr{x!ugcoAiRL%(-1*NG}M&2;h_q*-Ihr%Wky
z4n>GAp&u*kyXE}1G?6$aWDtr{I9yO_jIW*<I(4J0Mo+cjY$c0@b)%fWFoozq=TfXY
zi3VAlokwzL1ELUa?_yefZZR-ohfou@cNbk|>4DVY=caGnT<q*dy%A&cRfj$E-=>Qy
z4LToknDpBLq4ECWl$h&a9<Tz`{*M&^O5uLM(>*=yJDE4QXBjQrT~VovVY}uxva$j=
zm7gANm*3&W;IS3pb_#J{uwVQKtH44K+(h&tv$K^z?!wMcC9^CKgF+JSlXKGxFsCWQ
zMXwX>C4qWMH0N`UD3Dz-cuH55oQ1<CKBrp4!x%2*?sMNleYnrtyucn|%<BGW7xuV4
z{aXhHa00Kqg1D`;s{U%O*J7spG3rD^^q!|Tvy0!!U3i_FT3DYQ^rh&Sn-)m&l9??k
zCNDXCA|YuHzW}#zepV0ulo=3cR5KzE0$=rkyWp%;N<#vkZO)gr4n6}SCxU8;Fmw5W
zt-F}$Mv<yV5`B^WVFWODos@2m1wJ7Cjz#Fc*WF?V0UxC8hM4?GqqK~Sqmva7O0xh6
z{6I%du}Qx)x4rFh$2|}g+jIn3-CdvTaIrY<jVwM`a{Kl>poK=E;_|Gl+jvqmcNbvO
z@2}VT|KkYM+T3A+0z*|=tn#ZfMEQV;c^p<_1@3gEv+NG+sxo{oH<d=ZWU=jq|6kJu
zd<c(iqa0JegVe{2%D?mIGB}a)rFaDj8}w{}1bI9}DJtEx$uj7%6sZYoH%@7$*ucjc
zGEciD0}g@lN8`0e4~_gW)u_gKt@>17NijW5i66~N2i`bse&?@``{5&}|F{?fzpR8d
zBYlUF+Ib$O_sN_b=C-lLY+)`<=eB!I)7VZfZMLH`LQz-PlV|%PHb71a(d5Qndyr|;
zQY5OJuJPdcp<u&x=0z&V;6KivPGGQdgqWff)i-F*BdSZ*R{#b)TypW7^Lra{8$tuJ
z=ZbSSa`*1<zq>#0bCf|7R8?KzetZ1iCk`QIF&+3bFu)F^h9OS^vC?2x*mWpcInd+M
zb#<6wyV7j@a05iXuA-uKG<h78`reNLy$IrA=muH*ihl?}p0i1Vn+zIJ{BC=90H`(S
zM1?xS_k@iLyKK8Vo-pzA>yA!rJ>HE1QB;*7AuP1CuEQz#zIU!-C0b~&0P=`si@2Tc
zFZa43uh;(eski7mtTiD>p`Pr4fnUL4!1h*IN{jE^)BUrr?P}ZNp#N)q2HrJZe?u~5
zR{}_*tr<;(xnS4z*+sUjOlh8S*qy=Qe&&MTE1L7>fAAq{lfRp;{*diE5*+jI;&zD$
zfn}T{@&L6XocMcN3dsR)RleKXVTr?ioJ7BHfMQ9Af@ZlT{4$(L4CY<d@tw{6rse_{
zr6~$-hD~L0i>@Wr*Zf=jRpx)+H|D;{{JR#3b-G&o6BqGJhtHuX@msw3F-Fu31Un8m
z7D{Bu8ohrvAr*B|XqbPAK?ix{1Hgk^p`-Z%Zhs_i?{crpvI!1ua&k5@vf1gnKoz<+
zXR2mfZAr~}Hge^&4)0u2VF`c=&6P<;0ouXh<5{z_k9b0X<OHBT26S(00TM+Q$j!~>
z{dEovRpDIOmU2~Q9bYtVIqnN92>24t;a38L$A}hPogYHiJ;p)=T2E~k<MA_!j{bUj
z?`<oydowvKPM4?p7w5_sX)WnoWG(F8#Qf$Shd6XJUQB2FOYca*$eNa{QX+wXd{!4j
z2&o<X9S^Nyl@D7)?M=G)*u5)$T5<RtIvjE8tM6@Z3Sb>|yu$#haxN>_FQoZw?S(Y7
zta-^nmfm$VV2-Dqgm=Iq;dV)2sovWnvVG?W$Z&9Y)g=W<ul6zKX!mbyBKhp8?AYe2
zJ+b*u=V;I~T9+3qL+!|r9N+@F0lch{6LdYj64d0iWS8Pah_-i5aN#ML;#Da6$gC?U
z$(?9MDA6IoUfHy9{;E!4D|$xP+HY75r}*PGzT-*N>aSc3_+!y{03ch*(5C>Xy(!o5
z^Yd)i*j~%VYGmyG7}NVm%0#}A)B@RI&y@6LBc9a$%j>M!R)=%H#I|;?i$?#^8iv@>
z7Wap%{`njY;;ar2x8LE1JI|eM=F>^UlOIk2PXKmtc!)W*$?`}|ZCzTEx2TFOiny{Z
zI>VKTfWq0OF3bfjeZncXE~DpXy5Nf#%f<Jf(gf>rRzP1R^o+4xotWf0?p)#h4n+$%
zO&G);#4OEfQL+$(cYup%G-fTk^=NQEY<-K>SFWZwqm7&W{!J~DY7yX}g8+3Ax2tqS
z;7Sm8q&STj&F%dtP=Yhz$Sfz--kOQ&?w}N!8Q%i5M-a0hxOu(FJxCXbaE!Xnm~YJS
zXktASZhxAoUiw!UW}XSbxq&XR$MPM$W*2D_5t3i#&YZIYq}HV+)$;_HYRJbwZ~e|0
z4@W*8q(?cBtE`Ud@;aa2`d53+xZolq#M%q&vo?NRlkWNgcm?fTpHSMb&rA+V8ZU<z
zmVIEkpFnSvE&wR~(=x+_+1khvHG$UtqH1D5<VO7-&`_Tq&c}f!0sL0%FE&7W?hOvB
zE8Ty0C9W!iz@EQs4DJQDdnToae4<hYr2bB`JB4tu?E{0ofs|>&F(yaH8q#TF^Jylu
zFlHi0%jr|~{HCMkrtOVfLcm@CZnH6798w5(Dn(al91gRSk&)8(c4mrMGynL<$<GCz
z`hl0fpCOr2#|4zh@Vx5dD3~NFRjRhahqGcNRqm(SD*AFurZX<CE?O0$_;K$7x#7ey
z{(Cv>$0}n!QaFR@+6^+zV3&z`z4u>EsEtib6tnoKKFZSK5l#I(X^S7;0(pC<e%76v
zFrz~`JAjRDxflwf7;>0)P^#P7n?UMaf}E4Hb};nnE{7tVv&U5VnImqKns296p6BXd
zd6+|L&QFF?TGelfy`zsa-)Hgo|6Xbu7hdAgY6T1vEp`WiA~q(%eGV<HjGU~|%$tXX
z-UrpU_pQ;^+?EO$62$Z;EzJVx3il|bf7TpgbQ@~BM)&pQip~H`b^!<?D2zN@g4!~7
z`P%0ToClp%l@vEszzc(OHs<Ko9b~aW!I3AE2pJie<wxB^{z7ubo`o0IqwquC>$yvc
zBnP%>pwr7jCr#cq#bwKtl})bjL%P<ZHVew#C6b{tOQ6oyz~<uW_N_)6TRdE4n&OX;
z^c0oC7O@?1g)s#T2c@qnslh9hC0R80?&K^#y`j{O&L)e`PA}na8f9kBD5IQuZ*C_c
zfNWmDJ1Bko4A{r|!vXtPNd98H2Gfgl0}lw(aaaHriQSl%)QhZ#)+7`RWsMRsXq}?z
zlf(JB--7!I$-v?;#%OLO`hgo5hrGYRPc0&Mud;<TkC-*MoNO$<!r}r3oTzOj5c|gr
zKo7{i%gCL@$}}Arj!xu_?D@leIW{BDrLi96K;-Y!Neu7s6Zhq6&|(2oVLbF&-bQl0
zw6W^tYQzGsN2RzjI_CjF4G#JasM@Q=3Ot{2`?l&#{}&>yJK)24P5?~)?(^Uliu!jI
zrd=c^&9U9Kp<-{7HOOdzae6LkqVk{adlQi9TA4{L(y<>QdM4WK+7;_+D<JFoeF6o&
zwsGcQ{4oOpB&s|#fWRJz2&++bDA_qZ+R#`g@p=UZ$A&L|()p3j+KXdE^tZxC2GQ1*
z`CmbR=u4teZNP3d^*7x=TAdaf;r?eLD~2UFPdzEZ9Ir6`q2Xrt*Mh6D+|*pUH<Lm8
z0$`$yy2Na0A76U2L(=}>?Ft}wpcd!n0V!4&5d6y(2Gs#@X#_+Z8v;eE4T3&Sw-Uq?
zoNy)gh{DPic)(|<`b2QQ72#io%^U#8__{Ce6oW?10ZUq!_3TJM7MRJ3&1Qdjq2SyF
zao){?%k@=vY#%pkgOQlRM<U-A!b#*o^?yd00-rv?2@QNf0&X@%ruP$$_>&}`es^34
zTmazuc|}`$Qq0epESHbA_J|l}`0jx|Ej`^#zh!-M^zCydqlLqx?V}hMAz~Dx`;Dk-
zuAZsQpQ+_Ecz;XvA1JcQ09Uj4pVA*Bl}w+A6BQvsCAXkQ;IHi8dr1fkb-0^uwOmr;
zr;by{)j>iP{PX@FV?p3~F4y>gSC(KfMwM;6D(SJ<+v!a_4o)O5V5WnE3tG5fw1>fN
zun}CAn&d94Q44^}goE~n%g7P`I)zSOPqGeLr4&6p$=g(s{43s-$A}=r(?bK~2;h7y
zKM=Yn&yOU_3q~rBwg`EAR^cN4=kQy+oXxgxk`Y=eok^nw=<>cc8Z55i0o^YWG2)9y
z1J(HS@!u^Ot1|#!GQ2x5|2xcrN|VLGc;l9+u_FXOXPC>?DS!;PVc>}So#MboIXG<b
zm%jt74sW^%v9(Ny+D&><MJr|BDK>^cAF<*Y@X{mn^fDZ2?`$OWU&G-c01G)BTtGaq
zW}#Gw{NCrEG!9nl95c57WGuO-Tf>bAq#7TthUMkK9<al|4LCRy?Hc6&VGG{D&a~NT
z$eS%fbguq8Cmt&@<md0#e-ce7oqZ5?OouLASh)eku_zg!j#qw&+49}sUh62y!heZZ
zWemK|zqwH6CjZ=AL7DCxma6K>kew?F4_pj3(aW5i%^7P<1G)Qar0T)nHv0?sB)=%<
zqe0lV73T8@+mlf#R7xy^@o?C-M4Rw|gZcpa_y5l3z!P^l?*CMJrV)FF7GeGJ4&fpI
zRS6DT5V%g-r4)f5+C-QcOzM!`4UOuOF&Ns*S*d_=Q1<(x+7uWF01xx0o(CUbMByl2
zcM}HW9h!gzL@85Q<>U#!$9<*PW7tLuhQ1tz2PgZ<){fzl{&NzmSLY8alyK}aGcqgS
zJS^ldC)+#GY~-^oSBr|s>ngftfB7Wfe!~S974822l<a``0N|euy#3$b0th0wyw(PR
zz2->>q81^y2ooir0loPqrXd`W?s|?jns@Bx9@OIS3QqgKZygR5?u9~1?`%P<DzbKY
zQ6ay{X$VDRIm57wbD@SS){DNYzh&V9^3VdW=aDfaWeT5!E^e#}tTqp#c!-?OM+pm*
zd}^p5zh|(ubtw$0*yCUwkWMN>7=ote-<|zMYPz>nlyM2av^E1$Qh*ANG0HY7a%Wq@
zzOOg;22|@XSL@WWsbp^M<qiaoOH)cYU?7fW^!J7wR8dePQqfR^icnJz3`#$D<_P81
zb2yuRhfAjYzHsoze-I81YJrds4tjkbN1s?0hWP$h$yCvbgAfTV2kF#ibpjPfp3UOv
zNVB8S8#rKr;9pZwmFDR6<e}k#6ZQ_3NhUYKbf8P!xBq4z)7~D}C46=|=@0L7GMPU~
z>Tl@)>58?UdoX^JnbTx1?*AN3A}~>PdLfobo%{6Y!Z^D%T;l0ct5k}Kf=2ajwKcaV
z?9x`dO9;&Q?)G-Ks;PlWL=gAp*9}yAwIj0CRU|&%w8c!;^{M@csk_^9nadRl_jU&l
zedngu`P5jgXr!r22@dWX_RE}J?gRDDo!pfQA!td1{M$7Dw-KOMVn8MRb%T=HmhXN?
zue)R@&Dmz@yupOiOxvG^UQi?N+vFm>x7rVpvOaWC-R}xzn!ORp*Tz#gQP~9K5+RH&
zu5XZDG23)RdR0%r?+%O6MHW7bnT<B9*IlZm(k0IehpTSNU1gmZpcGhZO#KCqPqxsI
z%U^v`_*3FaQr-%R8VVh^-~9Bmt72Q3P0lipgHc=p=1aSZMHgR|KvkVbrMUA-7H4Q*
zc|*;&w<kBOKhJvk+^!DiShe*)p*}77QLGt{zAv=jN>AmBYHNd%$mA}mm~$f}HC)A0
z=X6-x(T&8hHM0eu3X@K)J{yY+3tfGD>~BKxJ)5so;%a2PBdy^08=?3&IjZy7iWcRR
zZMV|c6m!>1<ji?x25+%wRAa^_r#4X@4%WTtI(ki1<%xoC46}9w7`5+s4F2R@2zcL&
z>dj^9j_Sl+4)!p^=NljOOOsD+daZ5V^j2Ch(^75z6Y@scKDfu`5~uKGX3B}yN#V+O
z!ze+KP1#Zt!0!QRoe5Obbq%VE)a*{~MjS9R`C()@S>;R>{o4nYSU<d+r{!7XcT5b@
zMJtDgdXo{?hy4$HJc8~AQhR<bQ$=WezF<ap#JsCRmz9Jbk>S+*Q&^W4&}4vKu3Gdn
zgS)|IWo06ouz-Z;t1aW+Y6g)&TZ4^s_P7Ey1wg`&cwZ}J@tMk3>ipZCCL%>p=wdfF
zg*>c1D>YcR2=0M72T$3px2LBjLeVVO|BUEsRD4OK@vj-~dd^$#QI?4~$`uftJcY$|
zyn+Mh4xna{j|oNS4L`j!1ZtLWuK#M5(K5-JL}p(o*x5~J=f>H9+5%Cn7P5O-QYQ<{
zB2Oc2HaSx~jH|RE)yp`jSpPKV`lXwajiLQXN8d>?sZdkJcQa+H)YR+Nv(6b*|A;mD
z{yk_@jug$~waqG!{1cn09CKN(yH=7kCRno@@sXy7v_G=NR$~rJnKGl)QKxfqvQk+(
zcg!H{PXdzRV;qxeO&24HCrOfAV#1#<!kSF?k^PZaPcv=G`y~BNmKt*d7q?<%Aq}!4
zW%{B!VmV@LIf)KAH2>&G6jZ6?wY+^ZJL(FH0=NhD$2xlGBJbE#5;{zW`eMvemr~dN
zQR}K-zaz!yklA#+XqRgS*$^fXXpTzI@P*RI>gIUe^Qrgc)@4bRI<syOSskhkD_k!w
zKeFZeLnYBuho0I(^t$UkIM>$xECyHVMy1g7yb&&NTJX}T94Z{r|D1KF{%cPASwLj!
zLJZU{HaMj!P#8&f?HuM@wcY9o>X~kK;wB%Yn7TfA)Bu9EG_B@v(Z0uP4PJEJte@<g
zGNl3^wy2~IuLqq?u4?-Uwswmz^F@~&&w7S)_Jt$&q%V8<oM1R<Y-InGra-u{^&#f)
zUg{Uar?P}k@g5x~>t|scOp@0$><>dJ*v=p4KTnP=j)aa+oy)3>MNxwc!==ujn4^X*
z2LDDtvvE*a3=q)cRErZ39!h)O5=m~MZh8<RUhak5U665WR#AS?d1Lr_y-ka8s^V^p
z_)zA1z+Wi6`arGd_orJVV&1^9%-apuO5OZAi!LqLe&krzG*gpZ<jU$~jY)w%pSiWn
zM_XIh;dXyvI-I!%20qsxjA9ZAp=-@jTC-)85ZE98n=fTQIV&nA&Uezk`_D6RIR0!T
z>qx~dot-}@|32;3=59HHSNpK}cBw9GrZ6nbUpcf~b1(*fEQba|zGpICEeckL1aCPH
z{nBYR9P4NH3hwv&mq|1TYA2QwEH;k%@I!7<$}rgj10&<{bC#P140`CYA@3+gZ@jzz
zuUIW>U#&5@u4QZwKyiFo-pq;g134I3F0Aw(y4{!PX0pg`HeSUm;d-9jh&7iXND`bC
z;8l(4DZcXj-U!53)oXvH2MiWTlT_R)%e@m-S|?A}G{ZZX{nQMFEvjYBtdqIgW=k2b
zhx-}SY5Uh=md;o{?v~f}(ty>+jf@15vSxFpHZNm4{79K+KWFnzrGp)xY!0WUl(|8q
zD6&U>chT8c>p0Hp@{-fbixFYWvHZPP)uM24G`-SdB5Ikw9vg0KQ~bHuK^h!%ye+L+
z_9qJQ96|(7blOGUOg^%?wVXTI)&NWJc6IVqW0w@3^AYhBZtiJvOKkbp=^PXF;+a1z
z--vsym@FkbD^Bg!W0@_8VGr?mLQFb~!e|$>#lGjQ+o|nHiwA$|XN&G>Po;8_Ub)VX
zxm|{3qO;@E=HBd*S!FO+{At+6eIhf~ht#=qCS$dGUw!Z9eZ=GWLCkA^gFSz-sG+gU
z^F|MGtQh+fgIgB&p;=_XaT2EZw9{&bu9Sz?3-Qy9zh_q#Ypb3)5j=MhVcaPL)t6N>
z1)KBHux}jQ_c{e7yOS*MUh!*JAn$G)^c;mD9UJ<IJU>c)q*wdGY_QpACZTJeX-g_N
zq!&BQGqy><V{5Y%Mk*M==P_4>jiTH)!^UorCmzXmEeLvl`7c+0tR%_rIlKN;LGgJb
z0N>a|dW$Qjp|P=`yM=m(lqbOJit-)L1F3L~PW`80o(ILW051EPqaQ&Yy%9@h{{HoT
zg4&LbpSp-Q;fqi2E|rzyu8Pv}t<qypZ)y3!^)>UqY08wp>aN9{KexCnaqYqS;=P<l
zR<mXvU?&)GaN+=XRQmS5Jvwa9ERTa$!#Iz5aR@)ToVSUoRPL^}v_-{DPzIYu!+d<z
z6#G+fyDZ9~ylVznvY5d^vKVGkd%=m>2%VCbMe%H1{CdeDa>l4UR8(RspC;X#--f_^
zUtr0G;BuQttKIkOlJmN^*G5^!GJhG9#P|JRQ2wAHzhJLPTxGy)MM2yhTLvrrwp06s
zdTr8JE-1MYlw8M&nL?Y!5TA$LlD$DUD?hrQ`hMYEZ9_2bpuI%6^rcy2$0oh$_`IsX
zfqT-w4ym!hMvT&gd?h!V?6yLC37kQkdGcOzg#4~j-VAFs(Ql>8m<E?j>hAnHBe|ET
z9ED%XRH!o%uU>03u0&3&s<n2eoKSj{6YwGHkJNH=Y8pgg4<_^02U|>!%Df9{Rqrnl
z?hWF3D{FXqLO3-Vs5?C;X0gfeumOq2=~7H~u@W55#>@>xr;ZVIZ?uTzy=CImRrPrN
zEQtNf%20CJ_K{lcSvEm93ajp-UQrTLV#})cynb?ehN!Dq8Iwx#c)|d=kPc56m@Yd%
zgE=9H=ny)(5^Z+7aqG<Amsjy)3$<3N7bj%eN7G?yuPs3LDN(6KtiF6YwZV*01R-1w
z)ao0bbrZa7iN>ffon5Z&?aVk+;sW?wCmQh_=Fziw;yp*Yo~e7%Y8EL)VR9L>nY;yq
z+>PH}fdzHjFqy}*WIxIzeQIga;XL&6k@G=`uIMIAm6>ktFz6T|xIX~+V=}{OG!o&w
zUo7<wq}ql}Os%fr4?lm_*x(Yd1Tda}Hq&>PIM36&J)Bx7ov)7-{jdQ<mIsKyFXpSh
zK%@8a<<1=(^DfMtT*YV2Y4fWLq%>15dY9;i`t3eRipQ+yv<6rCWol7;PCFU`IQe;&
ze6eqE6iywDI(gzFjA%RD+FNj`#y8BCKM6D23qja#y-CiPvLS(+r%WoPCAQ4=5C>;O
z<we5Noz9GG4`m_9=Awb_BhB-WWF>&oAAsh|*(&+hEGkG#x92V^m=;p)h2|W!$qa)7
z7U{9;1cRGE3`VY@Kh-;vzBr@4HaCjss^`0RXv8ua;Pm+Rh}e9mYo=1gxn__uzn0FC
zBSE!ZI?*C9@r&{M*aNA#c~<j>>ms3n!vQbDUP^yIM%8sNVJV2iO0?C}q`AI%gxB*X
z?n0-3os_goh3X6|V7oZ1(|2+#E1jamqMpEV0&cbS@~fc;0PhF?AAF8N2QIyvmB)nk
zY_%=E$zvO8g!+|04dzpR`<==J+o|B_s=^8yJ-Aiiz}|`5BXTiDY*+O$oQ)yaX|HdQ
zkU<l@!!OJ8YHN71-paJQHzdl_?@55qErMubFgz+STiGG=RaX%Yjl-HkN*$MoqBL2&
z)-=-ccx=6M|6t~*g!gq`I95Z)oYU(ZZpF684sS{_8$LH!ax=)-X`AjOW^8QfhzJE?
zQJ-B^M-1MRi!GZWvx5^8bKtArc_^b|+Lu0HSD<&rUaG1yWjKR}W39fKaBx470p16&
zUvLA!lNcFtTW6_kZgY6xnEh8oN42`m^zCkT^h{QX#VEP@C{Odo-!?1N?Gh6MgMvpc
z$tLf&)-zJN7%QEEy8&6b7K&5cgr)pCu#yV)!o|H=%6{`?<*!GF@(M|lsN1I|&eo%-
z20WCozmbNIf5i@6SCSbLNmjvLpy(*{b+2S5c`qA_9B~1dBuWH%@<TMC&3ssJ-1Z%~
zbIokXGAv{mx-CI}PUigvE&`3H-9k+Ttqh=6P85<@<YRjOA4`+}rkmfNEw9>u=X&F!
ze6o!@orwWAnLfU3`)tc?9`-3BsLA|jAD?{y*%}|rL8cmM_8WipPoP@yu<0iC*eqzV
zhr%UjdsKRs5B<tw%!kziG#9h7I9o1xr{89lj*cx{)nMbY(&=Jh*XBg5-4+k*?`3_=
z>G;WE^0is}nw*?ex{1kTo5vF88^k#GT{t);fs>G;*XaWW11e<~=;;x7Vj;z}Z(hIV
z18}w>z_7x>NgxG!JSBZDx1%`R6nkApeM;`e@4`hGjVD4NyKkGWzdKbh<S~f(c-h+I
zkT_%{-*#lu-zhbKMqr7=<>Ev(TF2RwalBld)q%R)o}MJ~=D55?jF3%i*-`MTgrq%F
zY81Fn1A`|$CA7+LUWi;%Q{`(6>1&my=ziYYCC*w~J9$6s0lW&hE{l^Z9vTbO=M;4b
z$s+<zDHFG=9iz{63Y1)y@99wec(iq)u#8xe_%&cI`H2TjPg|l;wqlRzL~(FC(a4SC
z0lEs~fE5fNsQBKupRdm{Ora7=iY3{#a}ZY>ChF+5cpbG(=1UhO$uz9y+Sy7$#tP}&
z<~|YQml8wdC@4VHI^ujLHh+8L!G|=817~^SOIL>=vvLj73pW>++DP+SrsxY-%mHS-
zPs8$HORR>z$$V2+Zuxgt@f6bKpwX+w#wCSSlRnGh`ymV;l~Grlp3vM%UG{kk_+ypA
z0jy+JS6-pBW+&5JS1Xko!=jN`FMXgcj030<FRIxu7>~Z~iyX<c=It?GRb74}{l6PA
zy0$;OF%=+5SL9EfTk^j8>GwDgs3c|Wv_EqjQ5QvAo+Z7!nicgmyJ16f+Lb6nP*>M1
zT3j+Q>hXqE+n`mA|4v7O-ND~~Rbzvd)wG~#L8~@1zKR{%TyAx?T1y2Wzy}q_EtFmI
zs`v&8K@U>lyuI0mpjMbF6w9pHE4kiVlYTJm&|xh>#})0N<kUoR59)i*Wu*$RKD}_L
zRfs4R+v}Pb#($+k!2B5owuW<nixv(pj{AQdIB)*k_%w=DARlRW&c&ia>ddXY^<WjC
zXFq%OS{^)R9kGuxDPp8TrV9DVG^I%+9PZal+V+!y7o%gAxzW)im7{~&Hf2FU((=tk
z;JXPjwnk4i>~uMPwuJgczq_QUtP~)XBb|8o`iRPRl3l<%R6wwyquTp&N})Tt=c+{6
zw0k9U<V#8p@5rxrxKa&`+e+WR8(y$ryrcNq+1nexrCK~tutnf|pK$g~z08u~<Q4E~
z0n8Q3!T7|@500}=I*UPkCCJ?}1~S0wc(a3{zRWr^oF_pzu?lhY<2S7uHS5pKE-*@;
zQ*!MR<sj72tVoT@db<`Ux!HpR#kX%M%We2{_M<97^}7@#DCX06KFH@Nw8mo)7JWvI
zS>y>`UP^d1`23VnaeJ<K=mXWZTMqTRk7jh_F}>l={O)Aw|Bxnj@mY6Z93D^MUNXkA
z{NnzaM*UH0^^`D&_Ws|NSEWIvT|_L*%atSD@^iWSq15JFl4n19LjChCl6-m|+MaAm
zTo^zX_P+U#iu-xP0){?PC3~%;lakX%L@^Ylzh%5OYBDia>Ip3{zy-Gbsnos3u^3DX
zj2xyqoNaTv-!N#&yr_MGVI0b5W^xjbadJ$Dn~k*SmtuxbYcq^+Ix6*l7w`x|vdQOH
zc^*0es*>Lb?@&kP^M6Z|2#bSJ&do>bM`3lXz8Mb@@!3@!bni_SPz_o+c(W?)L<}f^
z@hbWs*BY?VBNcRw;ktwVww^$@Dd7g37`0fZjP5qpqSYv;yJ%x1A_^J>do>PPCS<*$
zIk`O_j8UOGyQJ!<)n-&NTdm2@US-gf$j#;b2$$pW0OIguOW%6ueHtjIz=RpURY5%L
zf{P<wc9$beGt5U(4$QkG!g2rpF>B0~1~od+;6Q=N(iTq$#EzQ25QrTz<<I=2FnOUf
z)JfF|`aP`2g#SoMbJHeGJ>6*a$B=2!|HD{Zwt}Bhk=0p_(v3a|BQPMoDFHJNCXw77
z2sVu1Gx{CodUe~xzS5-_`f=X&wx6^_G^CnmxG%Z@jjQvAlN%M;%8K4N*G=orKV{b=
zBK+T|hX2$mBU)X_o{bf=;}9XBYwOtoACpBo>;f_`xhVT47rmugvq1x%rsZXes{RkY
zV?@qW!D}Qu3l<!%<N9LorQZD*hK$ttZCe~D`XHWkY+_;k8}LY9XlARgck|S1IBf1v
z49LwpglB{?@#m+4uSyEK8#DU(h`V1oZ1HOA?l-DRXfN)M|7}W7Wk{D&u5m&j1#PPH
zRjo)Z&LI~9K$+*$Kz4~tcT)}+q&NTFJZ5_W_YLg_7IEMwUch<~U`gDZpVF%^tj=;X
zUgevjr>9A#&xQ5JXJlNNJF=vAu#t<7r|`*_ut}WGd{@1`+-GT{e5np!R2bk9!U4rD
z8~*N%%fI*9jg8cYEqK)D2JIs|iE~=B<%tr|$Gi9xnY3gatMQO(q7(mPh}jjP^(sM@
zV7b2iT+|3@_)Na-T<O=MxOR`&k>>9+f(4F@^Yyc>Zjn)B<neK4goK!Nr;{9)kcE?V
zQ>Y?v%AX}KT=mP<e-H8ygHI3t%6`=!{MIycEE77iQd{Ug4D{0Rb9DGcKRir4KS`z>
z9t@nMZw<v4!5|P*w6O(?G0T3yNfAMLvT0iT&`(-0+M*JUh&DBrwZ1OWcUfVWV>{Qx
zEWMi+^oEX1o^ZS2Ma@9@PrLz`bD;T8?>j!!=%to;@z%gPHBb6PT4it!2Q2tK5_svH
z&mMU3vsLT1)_eY?8nPXOc@szAr6Y#Pqk9NBK81m~qHR|i7?$ekTx2aQ4%?IOuk%R+
zr7Wv8<oO2?#`fehthset5=^}dj=nH`eCfD`Q!@Qe27*Hk`23!$F=@`_{Q#ce^%OSd
zu)40O_!<L<R5+U#p)Zl9Wt0Kfq-*<Ix?)bk`)s2NOS@JsZklc&Y*nDmZdq#DDU{4^
z$*_a`xf3Lk&{9g5E_)U$%(jz|$@=EAzRX8QCI5SkXUBcG+$K7<aFuY}VB3^L{Q3V~
zAxuypC=wxEfli*<9C>~V0xz=7Z;wwLobvM{OKIf+0RakY6rfnV+^mZlH9FVz|5Mp@
zM@6wTdz2uG0VHQhk|gJ}AUPvxNdh7{XIMZMB?^)e5SMUa$-98Wm9Qiwset6XfCL4U
zoa38?``z!p|K6K(_MDkBv)x_Q-PKiHzv{Z4F)z={F1Ce27KVwP>sT+Uy85rhkOO9}
zU#LL78`7faE|8)4(Sk7ar}nZr6=-sIB|qS(E8*8^fR|=&qUPPfMzMOz*|(~y5uzY^
zP}0L^El{n}W)XB-Zi0WgWM%;AefG9=4@`m9!J#;9gE-KfZ9Bls56FY~J*kn1(oPcK
zUQ73>1=c?SY!52rLgYh$BS`?S2BvUXIF;4<R67I(0^%a!Ffs2YF>7gWq#CKu0)a`f
zX=<!mc}Dz652;=PJwy47UFYo@$QGr_>*&X9>NXgcvG`$^g~S}vh|uBI9FSJp7U$-c
zXWtD82jWdh4sd(1L34az(b>~GfV&b0Cjh$)NidfkC5^d&%$UG<d!<jNES~+kk8Kow
zRM^4cw&q66@`{&?<!2WfHVp&U{uenQU{YlWNHH;W-Qj&I$77m^U!Um$B-fZ4@F)g(
z;AFh}#gJ==_y$Ymh9B*;{9?=|*Iq#f!QW8Uih1T=?2XAoyOlIj;8u@4pDRb0Vngo$
z{~Dk?wgTtY)T-(Z!L2B+`&Ai}4ECZUFSRfn<!_1b_m=hY1YAZO-BmE!j9zYQUV)}T
zOm=M2@|>KT1zNpsxJG`g%`QdqPr12TUfqZ9Xvz)d`$?Q#g_=xSy#i)tWcNceGks7^
z!L)a%<s&_Y=)4z{2m9z2Z~ez!04RQWjY<$wVHVYyz)mF3)Pv5>mT;?w%4{F76G=0y
zP`EvpXO?XC;um5`i5e+UIPWz#rYbcvGHYyhrp&l6jex5Rw7#0m=md8A_W%3}szLC1
zVG#x|2QKLYgaT8Te-6lD7F0F27>;S_ch$&DIrz;)WLTOA9jhn2tNb-BiD^g}H`V_2
z8<y$-S*T_jC9h(&oh?Shm&>$LzVY{(#=cjiXZtqR?S5(GTs8U;<f~1wE=zJsCAjQ+
zz4y^%mA`w9zq?!1*@mnur5h-xhL%rXxe>&QH&R+)4uTN-t|?RG!FvpBqV745CD3b>
zjy`U_^p*iO2jE=A;l~NX-x#Y<%Z!g^`Z`nuXVdVS7JL+Y8_Gbglx9>WPs+aRH-4~@
z6m?@zGjX(<eX=P-SS2Br^$dlJ+oS#p2ROoSdZveJ<x>>gzQ2Y$orguCq|$gYdr_V3
zhlfv;6Yf9u!ry|L2s2i}!pHZ=OHR%$xYSeLZ4jHdPta-fQ38wqpO|>M0p1-*e(zDE
zKdY+q;R(FfdEo4Xbzh?(BWV6znGj?u{HDX+YWs_BrMv%8cd<hlm*05PqCPoRxO{7i
zCAv$(r1q9pc4h^S2;jimn@Z<hsdxTuoLn&BA{-=YTEmgB@OwfbFgGmofAiuKp8IJ*
zi_OjB1Iy1#%#@V7{j%ND(p@~1e99*WDLh$OJuep}TuZDE+<cCOZ_W!+S5+&Fl;+;0
zSLG4Y6JyDdAkC6gGJUreA@-q4KIA<5OWIj;e{<61LI;>kS1bS=tQ`HjW%&TmfRjtD
z{V+TJgctrEm&Isj6LCY}dA@vamKI-K6eA}f*$Als(8K+tWqRU=bJZPKz{PegcBJ9(
z+mTs{cbi};Gcy$3<HX{UZA&dHSOID#&Zub^lOBHBpcT`wxY!;AKpenuZvs3!cVkF=
zTQ=Kby3`Z7bm}<uI{;7i)%3S_a^f0{y_S6`J}>nu92R>3EK_dKp|<GqUXP#4>{^t9
zek3mm!09I*!j|J5k4M{0I1E>hm<u`8^9{6GQm%_&MX?+Su*ZoVR~+hqjiQHNXvy}*
z2S4S^BCR38AEE!f>+;>LOA>(eMjuu_1K;9oe!A=iz=3<oq9-r@UTOZnj~x6-nU7bA
z#3__+ir3`60oe4w{fZo1tbYD&&3Jr<s`6Hesnq){2mQ|f8Xf(l1wtE28Q#ChB&tiH
zJyjG>719Ca9fw04>&mC052h#MwlXNLg>M>|837?*0HGMH2O?h88M-I}>j%Fke~jiD
zuA`)CPz3%a2VngO@JAONjGts0lQ#Vy#NO>28`KP?Z449ufElKI6MJ@<F(KxG6c(27
z%;#@*Js^Ic=aVFn^W5u@p^$?M+>ZBm1UMElgad5!TzS<Y4-5Mq>6&O{%Uf;(y;H7d
z9`_bIFfMug&$N$W&jhQ9d}k>FeCx^Kj)8RV-(!Pz|0TpO4YhwZSpc$LUi<g1(p6<w
z9*V0Sfve$t6&eUA4F6Px<)IT}fUm4_tU+-^%1Q(_j^m$2NkkAEX`R?l5lwy&^tR$A
zmX?<Sz?8Y=qNb_|i6LuoFEu9WG^z!nwt+@0_5zDXoQ@-g9b>#>ymepc2_+EW4RC4t
zic1|)Kv3}o%KvAZ?o)q;)*x+1;fC+g`ALc4tgNk6x5c0FU%w_O+V%XwA<m8KjTTa!
z0%i{rKPJ0On6YBsiQlA!*(^|nw)s)85=IH6i^s~n@|bu@EdPr^4L+bMl)Sv^Jo}+!
zQg$C*WQ-JyJ%pZDA{y4ZV1*}R8&hX?#0G|7{4(Vy!>uiDTn?c!3;-!Xmx|)%9AOrA
z?c;x)2xy~(OZec;QKw9cg2TV)u>?N@fOY^8gx6g^9%UFfYTJeD6uOK!hP$bqrF*es
zXGQCS%W{bEl9H0pXms#;9gwmi;u5yPbQfF11Hzj5Wm^hQH}G4d!jP`1C8N>IHd>&U
zUbf*}y}v%Y)u*3E`MqVKY|NqI#TU{;AQvT0>tBBtd}!zme0<|lOvwSRCxw;?>AYe*
zZ5V%?D3d(n?|EQg&&qQ^Qaqg;F7Dyr5C;-B<<4eHjCypeGyUwXA=|kTk@oB!FgcDX
z1lt-rm(E7g>FvVmR18F8HEQ0)wJ`wVL+%2=jJ9Y^dY^>$!U@~yG!}E~P1+a$8MVt}
z6Y6iG-~TOn>b*zSYEqwUj;ufr*>4g+yzerGitSD<1Ud7L%fxU=TM2B0+y`iSfENyA
zdU5^3Gi^p{Jt!qJSm3@Kub~KV@+C0<20R0PIK*C;5+D@d!7&iTiSuCba-85@14;y7
zcsO$1m-B+~I-dJ4AaWAtW#Xl@CdOlhw1DITUXeG|(i9nSf*e#fBQJ>lRluuG;`65s
z#|D1kI(aT5PWb6M=0T(NjrlfPc}i^zkESiL^~wX=*8tJP`2__gWuqW}AWO0bfQ9~W
zgXjQ2-2Jz*xVX5xQ3{BWD!&I}4rq%U7|fZUDpbb!V+*#m;aDgT&^J#6bMue$dRdCw
zk-OdSo$4?dU>5x{>&DyAAI)AV5FvSpo?<b7@o<CW@_WdGvt6$#`gB!05BNg+SiBnH
zz6W5rxHFM9aZgU~+#BhLSd(npjy^KE*TI5aR9eH-fIR0;!fgz<ulfUJq(>8U8d#>R
zH*BR#0M|YkyHm>SP;+2#yl%dB5!7EHg}3M4?1|Z-G7cFX6Hh*X(h>YtjU1<1-?>Mn
zM{dl%Irj16dhqRobNiW~rB}`*AzcFeT*WbY1E43as~~?_y2X~-&{8g%W101hiNf`>
zb0Qp^TrQhXv98!N5ntY()@J)78($~&&wUxx!(}s|l>(trA)a;qtVn&Uz|y0$F8y8t
zdf%u5mDLNE9S^S8G4E-ur3Lw5wZ1d;X@gR{DcMF=iZ0{v%o?H5=iE{inT{Bvsos(Y
z2zkwS`-6E;J8wTeL$!=U^;h`UGNSbK@HE~+?R5hK^OG7rg@FA|*YM5WzS=Q+9BQe{
z%iE(xIHvDE?lsfAr5A)3bbC?%bJR+mrUrcu)#Di}_p+TF9W(3LCN<3{BTzMe<1<xp
zU+12z@;l^ZG=j3v=`@tc+xI4W_4Q%~nM<VJqV{=Do1m&xXD&yKS)hP}-?8$_5ksTv
zM!<1Qd3m<6ujtea;cl7YyBV3rt9M7to7IAxetZhB_Bc*GC@g$LtzF@P_=;*m=6*ta
z=y%RFGHAX~{egh55Yacj82mVc+M~I{UQy5HYY6>qMhfI17Tr5cPZbgId-Gd}`H7V;
z^fxT_*5VT_j<S{AZb?7<3M43lVQ;Ekcop1e6nNw_gby3*1`%7RW#v#G0S<cO6KUTM
z>&oe4#kSzhud6K4g@f#ox~M?bBPmPc1hIq2>aIDXVSz2H#qC(ZLB4Wwb{cKrMhWt6
z{A|0?kV+5n1-V<aeW>3}OR;mcoJ>-aAFS;H)$!0;!4>VtYPE0lUp;*qn6<o}M@I^e
zb7>enUr*mx#jVg{Q~4#!TV>M`uIoRpyJT!MK+)?;Vh%bQOn$1tdZzD!@M;PeD74QR
z%hyt(6)`I;)S_`JT=)1mu-Cq|JkF1yq5~amP9wKsHw=Tp8-eINrMLJzjan<d?fvX`
zvZNkGHhi*ewBLU<DeeYa-R7LKH(kzG(M4JKL)6}Q92t&E6kud3XEPIP1ZCqJ6s!}(
zD{CYtTWJ*%ijnlBEl`*HzRqPsDyE7EvmZ6D8_DmFCTQim?e%;apBc$y;hc6A>|auB
z5t2x4sO{z9<#tAT#t^u{0zvG{_9=SCH_x_$MS5qNbRI7=ML3d7+1mhG2l0CXh0VT`
zMX)PswiQOoEi}ZO&<DTxZI80F@W}`9!)<4`R3juPcFNf!Gxm0m9?$1EH9Povb0zku
zK#^OMgVL*DHA66+`|)Ajkj<PS6}i2%s2Fz=NmGs#_+rJCvM3Z;%o;xlCi{@x+XAgS
znnB!6NfLq^4zcXn?o>r74TK%^SJYK-y9-p>do8}$iE*0x63R*{L)+xtQP-R|q8Kl<
zu#xg2CftKQ8ks^ZG?{>qN~d=e_QWmf;t4)_!LUX|G=-q!gIbLcep@Un=2GL6dh-ZL
z?V3?*Z|4{ny+wT?Ns!=k$XN9T_%ln~Oh1FW$D7hL0cuvtawM0HT}#7D5F9<R#B*5u
zg{+M}%m^d;wdy$@y8TX6(V<*cFZ<dAU=v$n4jr~>7g9>hq|kal_SM7E(wk)Rs_YD=
zL!`Cuq3_I`o7`}64(g6g6Czj|5%7c+<yZQto)r(SFF|jqnY^VoZzn|iCDa@`1kX*-
zom^+ndA_n)Os_;upxn%eLL4Ks8QB|7SeIyf3ag2{srHr%x@hkXAH4ghwVV{bX}u!|
zr)~>cZ=39Y4<T1H>3uCYEG-t=Y+`6BJ+j*fGtYdez_r*xrCO~eGGz97l~jun?CZtF
z5u;8p-t3hh?a3MfY4%-Eg7tJawW`6?qrWr4(o??@Vp{4kTiYde+A!vTfrZpn@Z!BU
zc}9>8#J<Q<*(;VFH~svHb*?@R&0JZNDYLb}<7!iV2qVvrX)$khwxJyUv|*nz@d4JG
zth|C#WF^h6kr8_~e_p^&Xd6pU9P}+pc1x^u><74?);8MH7kfv?#2>xn_c%Bm8P2VU
zFO0%;x><HIx)H?VO6?{;-03{bk4qTLK^jU0SKk{p#RWETT5egGs7-@8xxU$PPorCn
z;;E{9BO@9{Cb=mXKk6%mqT-FP!sSK;#b{MhW&&b0=p*^{oy2Yh?=9SAK)5^c<82_b
z41H&y^e1VY8ee=g7NhZZQN6q^O>;X9`n_n$eOduMD{w<Vf95yU&ak7d{n1S4dGLds
z0jADU`xPNmHQh9N_tOfeBJF;}6Jka`FDUpv3xrQX&&X@41MydJrLybK(j?^EnJI8K
zFSj54$r5f!B13_!Oo({Kq`v>*MC$$>d!!@_AA~!3^7v0uPpsVGzU|b#zPin>#Akzl
zco;<4A{d(dr&nLhd!Tt^#b9q`22+<<80_#i%EY9=Ijd5T3s3Ozsb=6NXmnbraj`C5
z3Xz5$%#HZ;)8>2F!OuJ8Y$~*YY=%Wv`0lB<7f?fk7If4)GxN$t{=@q)$#taZeosq|
zt4XR(g~!Z%MrB3a4EGK%Z_DXu{vxB+U;PgB3<6NI0%gQNRT%2g7d{!X7F(KNWwVWK
zo!OA0a<j#_$8Rm6;`3)E-{!J5qrTit`0P$XRQ3FPmsJLWYAlFQS?#O)c*OHuS9`Ze
zD5EWCm(ktaA5zaFpONU&@PQ+}VT9+)n*`Ap7Gr=g{W7jd>PR9Go6-Wg$b!eooKhq)
zI64?->&zlc5!#||=z)R-(9G9pfy}3gS)GI!?k_YbBfo8~Vy=nty+680TRz!hhkz44
zQSFqZ4T6mnqN?rZD;lPlMmtOIJd>{AA?~2hlZGrpz--w`|5K6WII!B|s3lv{UG!SS
zLF1>*H`M~mxdQ*pH}{732biS?C+xl1*@tLn%BRe~exYGYrEL;vS5aC)DhZIgqElqc
z*5OK!f|(sbO8X)1r*8f_vzD}k7^Em09?dylQDKv4Btz3$V{ir|=6eR@!kETUaE+by
z>?EITb+nY%eV|!xa~V}~hcp>uXRxWR{%w>N8ZqI5ORi`e&_?XVUt$Ie>c|TgL1mtb
z79i=U8+`*MxgqzaJgak67Ezq1FC}o~RB3Ow1t};YdOw-{rQ}u&fBtDS0nTTzqq<#D
zAyMwTUwO+%Q~r>fWx-^lCM2-OZ)jz~Ad!rz19@?CAzCVf^Spe}vn~!bE%V{q6!HQy
z{uzh~_x@TN?=5P9Z$i$z_r6!ZXEE%Y6I8M;Io@i9)b31l7%XFib(?ZPi!rqeqV8;5
z5;LqyY}U`vZ%>ezJ;%7A;)U}0Vdt^f_^(#qO8aZ~U7DBYkc#Bjb<Eysr9@Rxd`m~|
zyPzl7K7@Qw*2UlUKN~C-2m7?lZ#MfzqZYG&QKT})YtZ!;Z>D~#UTXNnj%X@aX6;<;
z(AFqu(W~B%aqh3%OrJi8`QoqKp<^rE?yycWTR#c)2lrAb%*m8B_&|i$t|=xIzd$u*
zgp>BsXjdTn>J*{e?u5RG8E&+9!OU{f-gLwd??=nFuWs$i+PMZ)YZyonE%$h4zoR-|
zRoHcz+wR#gM;UpJB_-cVN!H|xj~x<fes#LA<9Y|(bSsbfc7o{X>q@^?;4~J?YRmOG
zv=BSFAbfX?Q9JgTG%E0*^vo%$tJ_s@%W6_9VPI@1*?av{X&>pR#rgBST#{E50^ua*
zA(kuR6YU5&UfwM5;U3v?-dVC#pG+<9xxdr%I8(vM+}NdcTdf&J1K$h@SFX)`g$Zf4
zh&^<+wbqOjLdy5|4fRF+cRue$;0rs{v-b54-tIqm-%+mDmE=gj7nY8+m=Enedt9*K
zwrR(x7PqN`?@5~SYvY@9CPu7p>QH07hpov?vmK(0e;+1h?z)kRN8hNWR$3+P-~EW_
z>{|E?OE7Ahrp@I0{f{aLqUyYKn<1Dc!fe{7U}pYjFiWHXM3OsD0Dir`B}G(;mk|o%
z(iFI0u@2BJ(qXp#&;ZEyA$=Bb+-qx@esfDJ*8`5Gj94Uy%-!^WAKFTX^T6N^)*`yR
zJZ1s(!PPC5+@%$XeZVEeT8h}~84eTHBEVj^A6!}!z}?GhS7?9WmjC6|#tLvt@I3^V
Ypy%Gr<y5#i@Q9<Pq^(#k|19)>0K9yLTmS$7

literal 0
HcmV?d00001

diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
new file mode 100644
index 000000000..f1b374adb
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
@@ -0,0 +1,264 @@
+#include "ck_tile/host.hpp"
+#include "moe_smoothquant.hpp"
+#include <cstring>
+#include <set>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    size_t total_size = topk * tokens;
+    std::srand(seed);
+    std::set<IndexType> unique_set;
+    IndexType current_v;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+        {
+            unique_set.clear();
+        }
+        current_v = std::rand() % num_expert;
+        while(unique_set.find(current_v) != unique_set.end())
+        {
+            current_v = std::rand() % num_expert;
+        }
+        unique_set.insert(current_v);
+        host_tensor[i] = current_v;
+    }
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("t", "3328", "tokens dimension")
+        .insert("h", "4096", "hidden_size dimension")
+        .insert("e", "32", "experts")
+        .insert("k", "5", "topk")
+        .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t tokens      = arg_parser.get_int("t");
+    ck_tile::index_t hidden_size = arg_parser.get_int("h");
+    ck_tile::index_t stride      = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = hidden_size;
+    ck_tile::index_t experts = arg_parser.get_int("e");
+    ck_tile::index_t topk    = arg_parser.get_int("k");
+    std::string data_type    = arg_parser.get_str("prec");
+    int kname                = arg_parser.get_int("kname");
+    int do_validation        = arg_parser.get_int("v");
+    int warmup               = arg_parser.get_int("warmup");
+    int repeat               = arg_parser.get_int("repeat");
+
+    assert(stride >= hidden_size);
+
+    using TypeConfig = MoeSmoothquantTypeConfig<DataType>;
+
+    using XDataType       = typename TypeConfig::XDataType;
+    using XScaleDataType  = typename TypeConfig::XScaleDataType;
+    using YScaleDataType  = typename TypeConfig::YScaleDataType;
+    using QYDataType      = typename TypeConfig::QYDataType;
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({tokens, hidden_size}, {stride, 1});
+    ck_tile::HostTensor<XScaleDataType> xscale_host({experts * hidden_size});
+    ck_tile::HostTensor<ck_tile::index_t> topk_ids_host({tokens, topk});
+
+    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({topk * tokens}, {1});
+    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({topk * tokens}, {1});
+
+    ck_tile::HostTensor<QYDataType> qy_host_ref({topk * tokens, hidden_size}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({topk * tokens, hidden_size}, {stride, 1});
+
+    topid_unique_gen<ck_tile::index_t>(topk_ids_host.mData, tokens, topk, experts, 11937);
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem xscale_buf(xscale_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    xscale_buf.ToDevice(xscale_host.data());
+    topk_ids_buf.ToDevice(topk_ids_host.data());
+
+    std::cout << "[" << data_type << "]"
+              << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride
+              << ", experts:" << experts << ", topk:" << topk << std::flush;
+
+    moe_smoothquant_traits traits{data_type};
+
+    moe_smoothquant_args args{x_buf.GetDeviceBuffer(),
+                              xscale_buf.GetDeviceBuffer(),
+                              topk_ids_buf.GetDeviceBuffer(),
+                              yscale_buf.GetDeviceBuffer(),
+                              qy_buf.GetDeviceBuffer(),
+                              tokens,
+                              hidden_size,
+                              experts,
+                              topk,
+                              stride,
+                              stride};
+
+    float ave_time = moe_smoothquant(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    std::size_t num_byte =
+        sizeof(XDataType) * tokens * hidden_size + sizeof(XScaleDataType) * topk * hidden_size +
+        sizeof(YScaleDataType) * topk * tokens + sizeof(QYDataType) * topk * tokens * hidden_size;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        using YDataType = ComputeDataType;
+        ck_tile::HostTensor<ComputeDataType> y_host({topk * tokens, hidden_size}, {stride, 1});
+        // smooth outlier
+        {
+            auto f = [&](auto i_token) {
+                for(int i_topk = 0; i_topk < topk; i_topk++)
+                {
+                    auto i_expert = topk_ids_host(i_token, i_topk);
+
+                    for(int i_h = 0; i_h < hidden_size; ++i_h)
+                    {
+                        auto v_xscale = ck_tile::type_convert<ComputeDataType>(
+                            xscale_host(i_expert * hidden_size + i_h));
+                        auto v_x = ck_tile::type_convert<ComputeDataType>(x_host(i_token, i_h));
+                        // y_host(i_token * topk + i_topk, i_h) = v_x * v_xscale;
+                        y_host(i_topk * tokens + i_token, i_h) = v_x * v_xscale;
+                    }
+                }
+            };
+
+            ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency());
+        }
+
+        // yscale
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({topk * tokens});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+
+            if(stride == hidden_size)
+            {
+                pass = ck_tile::check_err(qy_host_dev,
+                                          qy_host_ref,
+                                          std::string("qy Error: Incorrect results!"),
+                                          rtol,
+                                          atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < topk * tokens; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride +
+                                                                hidden_size);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride +
+                                                                hidden_size);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16")
+    {
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
new file mode 100644
index 000000000..9f9adda90
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/smoothquant.hpp"
+#include <string>
+
+template <typename DataType>
+struct MoeSmoothquantTypeConfig;
+
+template <>
+struct MoeSmoothquantTypeConfig<ck_tile::half_t>
+{
+    using XDataType       = ck_tile::half_t;
+    using XScaleDataType  = float;
+    using YScaleDataType  = float;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+template <>
+struct MoeSmoothquantTypeConfig<ck_tile::bf16_t>
+{
+    using XDataType       = ck_tile::bf16_t;
+    using XScaleDataType  = float;
+    using YScaleDataType  = float;
+    using QYDataType      = ck_tile::int8_t;
+    using ComputeDataType = float;
+};
+
+// runtime args
+struct moe_smoothquant_args : public ck_tile::MoeSmoothquantHostArgs
+{
+};
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename DataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kTwoPass_>
+struct moe_smoothquant_traits_
+{
+    using DataType = ck_tile::remove_cvref_t<DataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (warpSize / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / warpSize);
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % warpSize == 0);
+            return ThreadPerBlock_N_ / warpSize;
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN    = kPadN_;
+    static constexpr bool kTwoPass = kTwoPass_;
+};
+
+template <typename Traits_>
+float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a);
+
+// This is the public API, will be generated by script
+struct moe_smoothquant_traits
+{
+    std::string data_type;
+};
+
+float moe_smoothquant(moe_smoothquant_traits, moe_smoothquant_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/14_moe_smoothquant/script/perf_test.sh b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh
new file mode 100755
index 000000000..d1e848b93
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/script/perf_test.sh
@@ -0,0 +1,37 @@
+
+EXE=build/bin/tile_example_moe_smoothquant
+
+$EXE -t=1 -h=1  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=80  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=128  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=144  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=168  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=184  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=256  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=288  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=344  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=376  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=448  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=512  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=924  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=1024  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=1078  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=1996  -v=1 -prec=bf16 -repeat=1000
+$EXE -t=700 -h=4080  -v=1 -prec=bf16 -repeat=1000
+
+$EXE -t=700 -h=80  -v=1  -prec=fp16 -repeat=1000
+$EXE -t=700 -h=128  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=144  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=168  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=184  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=256  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=288  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=344  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=376  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=448  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=512  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=924  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=1024  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=1078  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=1996  -v=1 -prec=fp16 -repeat=1000
+$EXE -t=700 -h=4080  -v=1 -prec=fp16 -repeat=1000
\ No newline at end of file
diff --git a/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
new file mode 100755
index 000000000..3bb62d37b
--- /dev/null
+++ b/example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+EXE=build/bin/tile_example_moe_smoothquant
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -prec=$pr_i -t=99  -h=13
+$EXE -prec=$pr_i -t=17  -h=16
+$EXE -prec=$pr_i -t=1   -h=100
+$EXE -prec=$pr_i -t=4   -h=128
+$EXE -prec=$pr_i -t=80  -h=127
+$EXE -prec=$pr_i -t=22  -h=255 -stride=256
+$EXE -prec=$pr_i -t=7   -h=599
+$EXE -prec=$pr_i -t=19  -h=512
+$EXE -prec=$pr_i -t=33  -h=313 -stride=1000
+$EXE -prec=$pr_i -t=11  -h=510
+$EXE -prec=$pr_i -t=171 -h=676 -stride=818
+$EXE -prec=$pr_i -t=91  -h=636
+$EXE -prec=$pr_i -t=12  -h=768 -stride=800
+$EXE -prec=$pr_i -t=100 -h=766 -stride=812
+$EXE -prec=$pr_i -t=31  -h=1024
+$EXE -prec=$pr_i -t=64  -h=1000 -stride=1004
+$EXE -prec=$pr_i -t=8   -h=1501
+$EXE -prec=$pr_i -t=3   -h=1826
+$EXE -prec=$pr_i -t=5   -h=2040
+$EXE -prec=$pr_i -t=7   -h=2734
+$EXE -prec=$pr_i -t=1   -h=3182
+$EXE -prec=$pr_i -t=9   -h=4096
+$EXE -prec=$pr_i -t=3   -h=8192
+$EXE -prec=$pr_i -t=1   -h=10547
+$EXE -prec=$pr_i -t=3   -h=17134
+done
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 15db0f46c..b6a44f76b 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(10_rmsnorm2d)
 add_subdirectory(11_add_rmsnorm2d_rdquant)
 add_subdirectory(12_smoothquant)
 add_subdirectory(13_moe_sorting)
+add_subdirectory(14_moe_smoothquant)
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index 604c9551f..a15d2c040 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -64,6 +64,7 @@
 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
+#define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
 
 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
 #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
@@ -225,3 +226,7 @@
 #ifndef CK_TILE_WORKAROUND_SWDEV_383542
 #define CK_TILE_WORKAROUND_SWDEV_383542 1
 #endif
+
+#ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+#define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
+#endif
diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp
index 5f4b64466..499ba80a8 100644
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -18,6 +18,7 @@ enum class bf16_rounding_mode
     truncate_with_nan,
     truncate,
     standard_asm,
+    rta_asm, // round to nearest away
 };
 
 template <bf16_rounding_mode rounding =
@@ -180,6 +181,39 @@ uint16_t float_to_bf16_rtn_asm(float f)
     return uint16_t(u.int32);
 }
 
+// TODO: do we need this on host?
+CK_TILE_HOST
+uint16_t float_to_bf16_rta_asm(float f) { return float_to_bf16_rtn_raw(f); }
+
+CK_TILE_DEVICE
+uint16_t float_to_bf16_rta_asm(float f)
+{
+    union
+    {
+        float fp32;
+        struct
+        {
+            uint16_t lo;
+            uint16_t hi;
+        };
+    } u = {f};
+
+    const uint32_t low_nan = 0x7fff;
+    const uint32_t hi_nan  = 0x7fff0000;
+
+    using uint32x2_t = uint32_t __attribute__((ext_vector_type(2)));
+    uint32x2_t check_nan;
+
+    asm volatile("v_cmp_u_f32 %[s_cnan], %[v_x], %[v_x] \n"
+                 "v_add3_u32 %[v_x], %[v_x], %[v_blo], 1 \n"
+                 "v_cndmask_b32 %[v_x], %[v_x], %[v_bhi], %[s_cnan]"
+                 : [s_cnan] "+s"(check_nan), [v_x] "+v"(u.fp32)
+                 : [v_blo] "v"(low_nan), [v_bhi] "v"(hi_nan));
+
+    // Note: in above code snipet, we use hi 16 bit
+    return u.hi;
+}
+
 // Truncate instead of rounding, preserving SNaN
 CK_TILE_HOST_DEVICE
 constexpr uint16_t float_to_bf16_truc_nan_raw(float f)
@@ -213,6 +247,8 @@ CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_raw(float f, constant<round
         return float_to_bf16_rtn_asm(f);
     else if constexpr(rounding == bf16_rounding_mode::truncate_with_nan)
         return float_to_bf16_truc_nan_raw(f);
+    else if constexpr(rounding == bf16_rounding_mode::rta_asm)
+        return float_to_bf16_rta_asm(f);
     else
         return float_to_bf16_truc_raw(f);
 }
diff --git a/include/ck_tile/host/reference/reference_moe_sorting.hpp b/include/ck_tile/host/reference/reference_moe_sorting.hpp
index c8eb7edb5..3851629cc 100644
--- a/include/ck_tile/host/reference/reference_moe_sorting.hpp
+++ b/include/ck_tile/host/reference/reference_moe_sorting.hpp
@@ -8,6 +8,9 @@
 
 namespace ck_tile {
 
+#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
+    static_cast<uint32_t>(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24))
+
 template <typename WeightType, typename IndexType = index_t>
 CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                                         const HostTensor<WeightType>& weights,
@@ -20,8 +23,14 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
 {
     const index_t num_token = topk_ids.mDesc.get_lengths()[0];
     const index_t topk      = topk_ids.mDesc.get_lengths()[1];
-    std::vector<std::vector<IndexType>> expert_tokens(experts,
-                                                      std::vector<IndexType>(unit_size, num_token));
+    // allocate a temp buffer, and fill the value with [number_token|topk]
+    std::vector<std::vector<IndexType>> expert_tokens(
+        experts,
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+        std::vector<IndexType>(unit_size, MOE_SORTING_MOCK_ID(num_token, topk)));
+#else
+        std::vector<IndexType>(unit_size, num_token));
+#endif
     std::vector<std::vector<WeightType>> expert_token_weights(
         experts, std::vector<WeightType>(unit_size, 0));
     std::vector<IndexType> expert_slices(experts, 1);
@@ -42,12 +51,19 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                 expert_token_weights[e].resize(new_size);
                 for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
                 {
-                    expert_tokens[e][i]        = num_token;
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+                    expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk);
+#else
+                    expert_tokens[e][i] = num_token;
+#endif
                     expert_token_weights[e][i] = 0;
                 }
             }
-
-            expert_tokens[e][idx]        = t;
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+            expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k);
+#else
+            expert_tokens[e][idx] = t;
+#endif
             expert_token_weights[e][idx] = w;
             expert_slice_idxs[e]++;
         }
@@ -75,4 +91,7 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
     unit_cnt *= unit_size;
     return;
 }
+
+#undef MOE_SORTING_MOCK_ID
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 1c6acec70..d9e28ceb5 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -12,20 +12,77 @@
 
 namespace ck_tile {
 
+#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
+    static_cast<uint32_t>(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24))
+
+// clang-format off
+// [indexing implementation-1]
+// using M_a as constexpr block_size to partition all tokens into different slices
+// each slice map to one expert, and one expert can have multiple slices
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+//
+// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
+// * this could be larger than actual, since actual tokens are on GPU
+//
+// sorted_token_ids_ptr   : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5]
+//                          |-  exp-0  -|-  exp-1  -|-  exp-2  -|-      exp-3          -|-  exp-4 -|-  exp-5  -|
+// sorted_weight_ptr      : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o]
+//
+// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
+//
+// * Note on token_id_per_expert/sorted_token_ids_ptr data:
+// currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr.
+// In some cases(like smooth-quant), we need topk information to indexing into tokens quant from 
+// different expert smooth quant. So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr
+//
+//       32bit    0........23 24.....31 bit
+//      (data) -> (token_id | topk_id)
+// low 24 bit is for token id, top 8 bit is for topk id
+//
+// the input after smooth-quant is [topk, token, hidden_dim], originally it is [token, hidden_dim]
+// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim]
+//
+// sorted_expert_ids_ptr  : [0, 1, 2, 3, 3, 4, 5]
+// * length is (max_num_tokens_padded + block_size - 1) / block_size
+//
+// num_tokens_post_padded_ptr : [28]
+// num_sorted_tiles_ptr : [7]
+//
+// * different from vLLM
+//   1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id
+//   2）need sorted_weight_ptr
+//   3) use num_sorted_tiles_ptr, already divided by M_a
+//
+// * below used for indexing
+//  1) sorted_token_ids_ptr [max_num_tokens_padded]
+//  2) sorted_weight_ptr
+//  3) sorted_expert_ids_ptr
+//  4）num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one)
+//
+//   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
 struct MoeSortingHostArgs
 {
-    const void* p_topk_ids;
-    const void* p_weights;
+    const void* p_topk_ids;     // [token, topk]
+    const void* p_weights;      // [token, topk]
     void* p_sorted_token_ids;
     void* p_sorted_weights;
     void* p_sorted_expert_ids;
     void* p_total_tokens_post_pad;
+    // we fused the setzero of output of fused-moe buffer
+    // set this pointer to nullptr will skip this operation
     void* p_moe_buf;
     index_t tokens;
-    index_t unit_size;
+    index_t unit_size;      // this is the M_a of fused-moe kernel
     index_t num_experts;
     index_t topk;
-    index_t moe_buf_bytes;
+    index_t moe_buf_bytes;  // byte size of p_moe_buf
 };
 
 template <typename Problem_>
@@ -183,8 +240,14 @@ struct MoeSortingKernel
             index_t expert_id = topk_id[i];
             index_t rank_post_pad =
                 tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+            uint32_t curr_token_id, curr_topk_id;
+            topk_mdiv.divmod(i, curr_token_id, curr_topk_id);
+            p_sorted_token_ids[rank_post_pad] = MOE_SORTING_MOCK_ID(curr_token_id, curr_topk_id);
+#else
             p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i);
-            p_sorted_weights[rank_post_pad]   = weights[i];
+#endif
+            p_sorted_weights[rank_post_pad] = weights[i];
             ++tokens_cnts[calc_index(num_experts, tid, expert_id)];
         }
 
@@ -195,8 +258,13 @@ struct MoeSortingKernel
                 cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
             while(expert_offset < cumsum[tid + 1])
             {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+                p_sorted_token_ids[expert_offset] =
+                    MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor);
+#else
                 p_sorted_token_ids[expert_offset] = prefill_token;
-                p_sorted_weights[expert_offset]   = static_cast<WeightType>(0.0);
+#endif
+                p_sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
                 expert_offset++;
             }
         }
@@ -229,4 +297,7 @@ struct MoeSortingKernel
                                            smem);
     }
 };
+
+#undef MOE_SORTING_MOCK_ID
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp
index c9e459765..24a59b45b 100644
--- a/include/ck_tile/ops/smoothquant.hpp
+++ b/include/ck_tile/ops/smoothquant.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp"
 #include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp"
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp"
 #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp"
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
new file mode 100644
index 000000000..1bece521f
--- /dev/null
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+// host side args
+struct MoeSmoothquantHostArgs
+{
+    const void* p_x;        // [tokens ,hidden_size], input, fp16/bf16
+    const void* p_xscale;   // [experts, hidden_size], input, columnwise scale, fp32
+    const void* p_topk_ids; // [tokens, topk]
+
+    void* p_yscale; // [topk * tokens,  1], output, rowwise quant scale
+    void* p_qy;     // [topk * tokens, hidden_size], output
+
+    index_t tokens;
+    index_t hidden_size;
+    index_t experts;
+    index_t topk;
+    index_t x_stride; // input x row stride
+    index_t y_stride; // output y stride(stride for topk)
+};
+
+// TODO: Extract some type to wrapper class
+template <typename Pipeline_>
+struct MoeSmoothquant
+{
+    using Pipeline = remove_cvref_t<Pipeline_>;
+    using Problem  = typename Pipeline::Problem;
+
+    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
+    using XScaleDataType  = remove_cvref_t<typename Problem::XScaleDataType>;
+    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+    using YScaleDataType  = remove_cvref_t<typename Problem::YScaleDataType>;
+    using QYDataType      = remove_cvref_t<typename Problem::QYDataType>;
+
+    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
+    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
+    static constexpr bool kPadM      = false; // always no need to pad along M
+    static constexpr bool kPadN      = Problem::kPadN;
+    static constexpr bool kTwoPass   = Problem::kTwoPass;
+
+    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
+    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
+    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    static_assert(Problem::BlockShape::Repeat_M == 1);
+
+    struct Kargs
+    {
+        const void* p_x;        // [tokens ,hidden_size], input, fp16/bf16
+        const void* p_xscale;   // [experts, hidden_size], input, columnwise scale, fp32
+        const void* p_topk_ids; // [tokens, topk]
+
+        void* p_yscale; // [topk, tokens, 1], output, rowwise quant scale
+        void* p_qy;     // [topk, tokens, hidden_size], output
+
+        index_t tokens;
+        index_t hidden_size;
+        index_t experts;
+        index_t topk;
+        index_t x_stride; // input x row stride
+        index_t y_stride; // output y stride(stride for topk)
+    };
+    using Hargs = MoeSmoothquantHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        return Kargs{hargs.p_x,
+                     hargs.p_xscale,
+                     hargs.p_topk_ids,
+                     hargs.p_yscale,
+                     hargs.p_qy,
+                     hargs.tokens,
+                     hargs.hidden_size,
+                     hargs.experts,
+                     hargs.topk,
+                     hargs.x_stride,
+                     hargs.y_stride};
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        return dim3(hargs.topk, integer_divide_ceil(hargs.tokens, Block_M), 1);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
+    // clang-format on
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        using S_ = typename Problem::BlockShape;
+        auto surfix = [&] () {
+            std::string n;
+            if (kPadN) n += "_pn";
+            if (kTwoPass) n += "_2p";
+            return n; }();
+
+        #define _SS_  std::string
+        #define _TS_  std::to_string
+        return _SS_("moe_smoothquant_") + _SS_(t2s<XDataType>::name) + "_" +
+             _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
+             _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
+             _SS_(Pipeline::name) + surfix;
+        #undef _SS_
+        #undef _TS_
+        // clang-format on
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const index_t i_topk  = blockIdx.x;
+        const index_t i_token = blockIdx.y * Block_M;
+        const index_t i_token_in_thrd =
+            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+
+        const index_t i_expert = reinterpret_cast<const index_t*>(
+            kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
+
+        // [tokens ,hidden_size]
+        const auto x_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XDataType*>(kargs.p_x),
+                make_tuple(kargs.tokens, kargs.hidden_size),
+                make_tuple(kargs.x_stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {i_token, 0});
+        }();
+
+        // [experts, hidden_size],
+        const auto xscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<const XScaleDataType*>(kargs.p_xscale) + i_expert * kargs.hidden_size,
+                make_tuple(kargs.hidden_size),
+                make_tuple(1),
+                number<Vector_N>{},
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
+        }();
+
+        // [topk, tokens]
+        auto yscale_window = [&]() {
+            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<YScaleDataType*>(kargs.p_yscale) + i_topk * kargs.tokens,
+                make_tuple(kargs.tokens),
+                make_tuple(1),
+                number<1>{});
+
+            const auto tmp2_ =
+                pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
+
+            return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {i_token});
+        }();
+
+        // [topk, tokens, hidden_size]
+        auto qy_window = [&]() {
+            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                static_cast<QYDataType*>(kargs.p_qy) + i_topk * kargs.tokens * kargs.y_stride,
+                make_tuple(kargs.tokens, kargs.hidden_size),
+                make_tuple(kargs.y_stride, 1),
+                number<Vector_N>{},
+                number<1>{});
+
+            auto tmp2_ = pad_tensor_view(
+                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
+            return make_tile_window(
+                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {i_token, 0});
+        }();
+
+        __shared__ char smem[GetSmemSize()];
+
+        Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.hidden_size, smem);
+    }
+};
+
+} // namespace ck_tile
-- 
GitLab


From c2bcbb1379c31a068234216a585027a91be57fee Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 24 Nov 2024 21:41:52 -0800
Subject: [PATCH 076/153] Bump rocm-docs-core from 1.8.5 to 1.9.0 in
 /docs/sphinx (#1691)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.5 to 1.9.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/v1.9.0/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.5...v1.9.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 3a2e266ef..5bec504a0 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.8.5
+rocm-docs-core==1.9.0
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index b65d2391f..8881c0e74 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.8.5
+rocm-docs-core==1.9.0
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 645fe812f65db86a9eaca7ae00e0004c1634bc0a Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Mon, 25 Nov 2024 15:30:35 +0800
Subject: [PATCH 077/153] [CK_TILE] Fix fMHA fwd MakeKargs() compilation errors
 (#1689)

* Fix mis-matched tuple<> elem types

* Rename MakeKargs() as MakeKargsImpl()

---------

Co-authored-by: Qianfeng <qianfeng.zhang@amd.com>
---
 example/ck_tile/01_fmha/fmha_bwd.hpp          | 208 +++++-----
 example/ck_tile/01_fmha/fmha_fwd.hpp          | 156 ++++----
 .../ops/fmha/kernel/fmha_bwd_kernel.hpp       | 232 +++++------
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       | 370 +++++++++---------
 4 files changed, 484 insertions(+), 482 deletions(-)

diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp
index 3b21a3257..722ef15a2 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -150,113 +150,113 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
         // create group mode kernel arguments
         if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode)
         {
-            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
-                                                  args.k_ptr,
-                                                  args.v_ptr,
-                                                  args.bias_ptr,
-                                                  args.lse_ptr,
-                                                  args.do_ptr,
-                                                  args.d_ptr,
-                                                  args.rand_val_ptr,
-                                                  args.dk_ptr,
-                                                  args.dv_ptr,
-                                                  args.dbias_ptr,
-                                                  args.dq_acc_ptr,
-                                                  args.seqstart_q_ptr,
-                                                  args.seqstart_k_ptr,
-                                                  args.seqlen_k_ptr,
-                                                  args.hdim_q,
-                                                  args.hdim_v,
-                                                  args.nhead_q,
-                                                  args.nhead_q / args.nhead_k,
-                                                  args.scale,
-                                                  args.stride_q,
-                                                  args.stride_k,
-                                                  args.stride_v,
-                                                  args.stride_bias,
-                                                  args.stride_randval,
-                                                  args.stride_do,
-                                                  args.stride_dq_acc,
-                                                  args.stride_dk,
-                                                  args.stride_dv,
-                                                  args.stride_dbias,
-                                                  args.nhead_stride_q,
-                                                  args.nhead_stride_k,
-                                                  args.nhead_stride_v,
-                                                  args.nhead_stride_bias,
-                                                  args.nhead_stride_randval,
-                                                  args.nhead_stride_do,
-                                                  args.nhead_stride_lsed,
-                                                  args.nhead_stride_dq_acc,
-                                                  args.nhead_stride_dk,
-                                                  args.nhead_stride_dv,
-                                                  args.nhead_stride_dbias,
-                                                  args.split_stride_dq_acc,
-                                                  args.window_size_left,
-                                                  args.window_size_right,
-                                                  args.mask_type,
-                                                  args.p_drop,
-                                                  args.drop_seed_offset);
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
+                                                      args.k_ptr,
+                                                      args.v_ptr,
+                                                      args.bias_ptr,
+                                                      args.lse_ptr,
+                                                      args.do_ptr,
+                                                      args.d_ptr,
+                                                      args.rand_val_ptr,
+                                                      args.dk_ptr,
+                                                      args.dv_ptr,
+                                                      args.dbias_ptr,
+                                                      args.dq_acc_ptr,
+                                                      args.seqstart_q_ptr,
+                                                      args.seqstart_k_ptr,
+                                                      args.seqlen_k_ptr,
+                                                      args.hdim_q,
+                                                      args.hdim_v,
+                                                      args.nhead_q,
+                                                      args.nhead_q / args.nhead_k,
+                                                      args.scale,
+                                                      args.stride_q,
+                                                      args.stride_k,
+                                                      args.stride_v,
+                                                      args.stride_bias,
+                                                      args.stride_randval,
+                                                      args.stride_do,
+                                                      args.stride_dq_acc,
+                                                      args.stride_dk,
+                                                      args.stride_dv,
+                                                      args.stride_dbias,
+                                                      args.nhead_stride_q,
+                                                      args.nhead_stride_k,
+                                                      args.nhead_stride_v,
+                                                      args.nhead_stride_bias,
+                                                      args.nhead_stride_randval,
+                                                      args.nhead_stride_do,
+                                                      args.nhead_stride_lsed,
+                                                      args.nhead_stride_dq_acc,
+                                                      args.nhead_stride_dk,
+                                                      args.nhead_stride_dv,
+                                                      args.nhead_stride_dbias,
+                                                      args.split_stride_dq_acc,
+                                                      args.window_size_left,
+                                                      args.window_size_right,
+                                                      args.mask_type,
+                                                      args.p_drop,
+                                                      args.drop_seed_offset);
         }
         else
         { // create batch mode kernel arguments
-            return FmhaBwdDQDKDVKernel::MakeKargs(args.q_ptr,
-                                                  args.k_ptr,
-                                                  args.v_ptr,
-                                                  args.bias_ptr,
-                                                  args.lse_ptr,
-                                                  args.do_ptr,
-                                                  args.d_ptr,
-                                                  args.rand_val_ptr,
-                                                  args.dk_ptr,
-                                                  args.dv_ptr,
-                                                  args.dbias_ptr,
-                                                  args.dq_acc_ptr,
-                                                  args.seqlen_q,
-                                                  args.seqlen_k,
-                                                  args.hdim_q,
-                                                  args.hdim_v,
-                                                  args.nhead_q,
-                                                  args.nhead_q / args.nhead_k,
-                                                  args.scale,
-                                                  args.stride_q,
-                                                  args.stride_k,
-                                                  args.stride_v,
-                                                  args.stride_bias,
-                                                  args.stride_randval,
-                                                  args.stride_do,
-                                                  args.stride_dq_acc,
-                                                  args.stride_dk,
-                                                  args.stride_dv,
-                                                  args.stride_dbias,
-                                                  args.nhead_stride_q,
-                                                  args.nhead_stride_k,
-                                                  args.nhead_stride_v,
-                                                  args.nhead_stride_bias,
-                                                  args.nhead_stride_randval,
-                                                  args.nhead_stride_do,
-                                                  args.nhead_stride_lsed,
-                                                  args.nhead_stride_dq_acc,
-                                                  args.nhead_stride_dk,
-                                                  args.nhead_stride_dv,
-                                                  args.nhead_stride_dbias,
-                                                  args.batch_stride_q,
-                                                  args.batch_stride_k,
-                                                  args.batch_stride_v,
-                                                  args.batch_stride_bias,
-                                                  args.batch_stride_randval,
-                                                  args.batch_stride_do,
-                                                  args.batch_stride_lsed,
-                                                  args.batch_stride_dq_acc,
-                                                  args.batch_stride_dk,
-                                                  args.batch_stride_dv,
-                                                  args.batch_stride_dbias,
-                                                  args.split_stride_dq_acc,
-                                                  args.window_size_left,
-                                                  args.window_size_right,
-                                                  args.mask_type,
-                                                  args.p_drop,
-                                                  args.drop_seed_offset);
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
+                                                      args.k_ptr,
+                                                      args.v_ptr,
+                                                      args.bias_ptr,
+                                                      args.lse_ptr,
+                                                      args.do_ptr,
+                                                      args.d_ptr,
+                                                      args.rand_val_ptr,
+                                                      args.dk_ptr,
+                                                      args.dv_ptr,
+                                                      args.dbias_ptr,
+                                                      args.dq_acc_ptr,
+                                                      args.seqlen_q,
+                                                      args.seqlen_k,
+                                                      args.hdim_q,
+                                                      args.hdim_v,
+                                                      args.nhead_q,
+                                                      args.nhead_q / args.nhead_k,
+                                                      args.scale,
+                                                      args.stride_q,
+                                                      args.stride_k,
+                                                      args.stride_v,
+                                                      args.stride_bias,
+                                                      args.stride_randval,
+                                                      args.stride_do,
+                                                      args.stride_dq_acc,
+                                                      args.stride_dk,
+                                                      args.stride_dv,
+                                                      args.stride_dbias,
+                                                      args.nhead_stride_q,
+                                                      args.nhead_stride_k,
+                                                      args.nhead_stride_v,
+                                                      args.nhead_stride_bias,
+                                                      args.nhead_stride_randval,
+                                                      args.nhead_stride_do,
+                                                      args.nhead_stride_lsed,
+                                                      args.nhead_stride_dq_acc,
+                                                      args.nhead_stride_dk,
+                                                      args.nhead_stride_dv,
+                                                      args.nhead_stride_dbias,
+                                                      args.batch_stride_q,
+                                                      args.batch_stride_k,
+                                                      args.batch_stride_v,
+                                                      args.batch_stride_bias,
+                                                      args.batch_stride_randval,
+                                                      args.batch_stride_do,
+                                                      args.batch_stride_lsed,
+                                                      args.batch_stride_dq_acc,
+                                                      args.batch_stride_dk,
+                                                      args.batch_stride_dv,
+                                                      args.batch_stride_dbias,
+                                                      args.split_stride_dq_acc,
+                                                      args.window_size_left,
+                                                      args.window_size_right,
+                                                      args.mask_type,
+                                                      args.p_drop,
+                                                      args.drop_seed_offset);
         }
     }();
 
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 41edac67b..704453baa 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -281,87 +281,87 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
         // create group mode kernel arguments
         if constexpr(FmhaKernel::kIsGroupMode)
         {
-            return FmhaKernel::MakeKargs(args.q_ptr,
-                                         args.k_ptr,
-                                         args.v_ptr,
-                                         args.bias_ptr,
-                                         args.rand_val_ptr,
-                                         args.lse_ptr,
-                                         args.o_ptr,
-                                         args.seqstart_q_ptr,
-                                         args.seqstart_k_ptr,
-                                         args.seqlen_k_ptr,
-                                         args.hdim_q,
-                                         args.hdim_v,
-                                         args.nhead_q,
-                                         args.nhead_q / args.nhead_k,
-                                         args.scale_s,
-                                         args.scale_p,
-                                         args.scale_o,
-                                         args.stride_q,
-                                         args.stride_k,
-                                         args.stride_v,
-                                         args.stride_bias,
-                                         args.stride_randval,
-                                         args.stride_o,
-                                         args.nhead_stride_q,
-                                         args.nhead_stride_k,
-                                         args.nhead_stride_v,
-                                         args.nhead_stride_bias,
-                                         args.nhead_stride_randval,
-                                         args.nhead_stride_lse,
-                                         args.nhead_stride_o,
-                                         args.window_size_left,
-                                         args.window_size_right,
-                                         args.mask_type,
-                                         args.p_drop,
-                                         args.s_randval,
-                                         args.drop_seed_offset);
+            return FmhaKernel::MakeKargsImpl(args.q_ptr,
+                                             args.k_ptr,
+                                             args.v_ptr,
+                                             args.bias_ptr,
+                                             args.rand_val_ptr,
+                                             args.lse_ptr,
+                                             args.o_ptr,
+                                             args.seqstart_q_ptr,
+                                             args.seqstart_k_ptr,
+                                             args.seqlen_k_ptr,
+                                             args.hdim_q,
+                                             args.hdim_v,
+                                             args.nhead_q,
+                                             args.nhead_q / args.nhead_k,
+                                             args.scale_s,
+                                             args.scale_p,
+                                             args.scale_o,
+                                             args.stride_q,
+                                             args.stride_k,
+                                             args.stride_v,
+                                             args.stride_bias,
+                                             args.stride_randval,
+                                             args.stride_o,
+                                             args.nhead_stride_q,
+                                             args.nhead_stride_k,
+                                             args.nhead_stride_v,
+                                             args.nhead_stride_bias,
+                                             args.nhead_stride_randval,
+                                             args.nhead_stride_lse,
+                                             args.nhead_stride_o,
+                                             args.window_size_left,
+                                             args.window_size_right,
+                                             args.mask_type,
+                                             args.p_drop,
+                                             args.s_randval,
+                                             args.drop_seed_offset);
         }
         else
         { // create batch mode kernel arguments
-            return FmhaKernel::MakeKargs(args.q_ptr,
-                                         args.k_ptr,
-                                         args.v_ptr,
-                                         args.bias_ptr,
-                                         args.rand_val_ptr,
-                                         args.lse_ptr,
-                                         args.o_ptr,
-                                         args.seqlen_q,
-                                         args.seqlen_k,
-                                         args.hdim_q,
-                                         args.hdim_v,
-                                         args.nhead_q,
-                                         args.nhead_q / args.nhead_k,
-                                         args.scale_s,
-                                         args.scale_p,
-                                         args.scale_o,
-                                         args.stride_q,
-                                         args.stride_k,
-                                         args.stride_v,
-                                         args.stride_bias,
-                                         args.stride_randval,
-                                         args.stride_o,
-                                         args.nhead_stride_q,
-                                         args.nhead_stride_k,
-                                         args.nhead_stride_v,
-                                         args.nhead_stride_bias,
-                                         args.nhead_stride_randval,
-                                         args.nhead_stride_lse,
-                                         args.nhead_stride_o,
-                                         args.batch_stride_q,
-                                         args.batch_stride_k,
-                                         args.batch_stride_v,
-                                         args.batch_stride_bias,
-                                         args.batch_stride_randval,
-                                         args.batch_stride_lse,
-                                         args.batch_stride_o,
-                                         args.window_size_left,
-                                         args.window_size_right,
-                                         args.mask_type,
-                                         args.p_drop,
-                                         args.s_randval,
-                                         args.drop_seed_offset);
+            return FmhaKernel::MakeKargsImpl(args.q_ptr,
+                                             args.k_ptr,
+                                             args.v_ptr,
+                                             args.bias_ptr,
+                                             args.rand_val_ptr,
+                                             args.lse_ptr,
+                                             args.o_ptr,
+                                             args.seqlen_q,
+                                             args.seqlen_k,
+                                             args.hdim_q,
+                                             args.hdim_v,
+                                             args.nhead_q,
+                                             args.nhead_q / args.nhead_k,
+                                             args.scale_s,
+                                             args.scale_p,
+                                             args.scale_o,
+                                             args.stride_q,
+                                             args.stride_k,
+                                             args.stride_v,
+                                             args.stride_bias,
+                                             args.stride_randval,
+                                             args.stride_o,
+                                             args.nhead_stride_q,
+                                             args.nhead_stride_k,
+                                             args.nhead_stride_v,
+                                             args.nhead_stride_bias,
+                                             args.nhead_stride_randval,
+                                             args.nhead_stride_lse,
+                                             args.nhead_stride_o,
+                                             args.batch_stride_q,
+                                             args.batch_stride_k,
+                                             args.batch_stride_v,
+                                             args.batch_stride_bias,
+                                             args.batch_stride_randval,
+                                             args.batch_stride_lse,
+                                             args.batch_stride_o,
+                                             args.window_size_left,
+                                             args.window_size_right,
+                                             args.mask_type,
+                                             args.p_drop,
+                                             args.s_randval,
+                                             args.drop_seed_offset);
         }
     }();
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index ccf15ee60..23174528e 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -304,64 +304,64 @@ struct FmhaBwdDQDKDVKernel
 
     template <bool Cond = !kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              const void* bias_ptr,
-              const void* lse_ptr,
-              const void* do_ptr,
-              const void* d_ptr,
-              void* rand_val_ptr,
-              void* dk_ptr,
-              void* dv_ptr,
-              void* dbias_ptr,
-              void* dq_acc_ptr,
-              ck_tile::index_t seqlen_q,
-              ck_tile::index_t seqlen_k,
-              ck_tile::index_t hdim_q,
-              ck_tile::index_t hdim_v,
-              ck_tile::index_t num_head_q,
-              ck_tile::index_t nhead_ratio_qk,
-              float scale,
-              ck_tile::index_t stride_q,
-              ck_tile::index_t stride_k,
-              ck_tile::index_t stride_v,
-              ck_tile::index_t stride_bias,
-              ck_tile::index_t stride_randval,
-              ck_tile::index_t stride_do,
-              ck_tile::index_t stride_dq_acc,
-              ck_tile::index_t stride_dk,
-              ck_tile::index_t stride_dv,
-              ck_tile::index_t stride_dbias,
-              ck_tile::index_t nhead_stride_q,
-              ck_tile::index_t nhead_stride_k,
-              ck_tile::index_t nhead_stride_v,
-              ck_tile::index_t nhead_stride_bias,
-              ck_tile::index_t nhead_stride_randval,
-              ck_tile::index_t nhead_stride_do,
-              ck_tile::index_t nhead_stride_lsed,
-              ck_tile::index_t nhead_stride_dq_acc,
-              ck_tile::index_t nhead_stride_dk,
-              ck_tile::index_t nhead_stride_dv,
-              ck_tile::index_t nhead_stride_dbias,
-              ck_tile::index_t batch_stride_q,
-              ck_tile::index_t batch_stride_k,
-              ck_tile::index_t batch_stride_v,
-              ck_tile::index_t batch_stride_bias,
-              ck_tile::index_t batch_stride_randval,
-              ck_tile::index_t batch_stride_do,
-              ck_tile::index_t batch_stride_lsed,
-              ck_tile::index_t batch_stride_dq_acc,
-              ck_tile::index_t batch_stride_dk,
-              ck_tile::index_t batch_stride_dv,
-              ck_tile::index_t batch_stride_dbias,
-              ck_tile::index_t split_stride_dq_acc,
-              ck_tile::index_t window_size_left,
-              ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type,
-              float p_drop,
-              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                  drop_seed_offset)
+    MakeKargsImpl(const void* q_ptr,
+                  const void* k_ptr,
+                  const void* v_ptr,
+                  const void* bias_ptr,
+                  const void* lse_ptr,
+                  const void* do_ptr,
+                  const void* d_ptr,
+                  void* rand_val_ptr,
+                  void* dk_ptr,
+                  void* dv_ptr,
+                  void* dbias_ptr,
+                  void* dq_acc_ptr,
+                  ck_tile::index_t seqlen_q,
+                  ck_tile::index_t seqlen_k,
+                  ck_tile::index_t hdim_q,
+                  ck_tile::index_t hdim_v,
+                  ck_tile::index_t num_head_q,
+                  ck_tile::index_t nhead_ratio_qk,
+                  float scale,
+                  ck_tile::index_t stride_q,
+                  ck_tile::index_t stride_k,
+                  ck_tile::index_t stride_v,
+                  ck_tile::index_t stride_bias,
+                  ck_tile::index_t stride_randval,
+                  ck_tile::index_t stride_do,
+                  ck_tile::index_t stride_dq_acc,
+                  ck_tile::index_t stride_dk,
+                  ck_tile::index_t stride_dv,
+                  ck_tile::index_t stride_dbias,
+                  ck_tile::index_t nhead_stride_q,
+                  ck_tile::index_t nhead_stride_k,
+                  ck_tile::index_t nhead_stride_v,
+                  ck_tile::index_t nhead_stride_bias,
+                  ck_tile::index_t nhead_stride_randval,
+                  ck_tile::index_t nhead_stride_do,
+                  ck_tile::index_t nhead_stride_lsed,
+                  ck_tile::index_t nhead_stride_dq_acc,
+                  ck_tile::index_t nhead_stride_dk,
+                  ck_tile::index_t nhead_stride_dv,
+                  ck_tile::index_t nhead_stride_dbias,
+                  ck_tile::index_t batch_stride_q,
+                  ck_tile::index_t batch_stride_k,
+                  ck_tile::index_t batch_stride_v,
+                  ck_tile::index_t batch_stride_bias,
+                  ck_tile::index_t batch_stride_randval,
+                  ck_tile::index_t batch_stride_do,
+                  ck_tile::index_t batch_stride_lsed,
+                  ck_tile::index_t batch_stride_dq_acc,
+                  ck_tile::index_t batch_stride_dk,
+                  ck_tile::index_t batch_stride_dv,
+                  ck_tile::index_t batch_stride_dbias,
+                  ck_tile::index_t split_stride_dq_acc,
+                  ck_tile::index_t window_size_left,
+                  ck_tile::index_t window_size_right,
+                  ck_tile::index_t mask_type,
+                  float p_drop,
+                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                      drop_seed_offset)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -470,7 +470,7 @@ struct FmhaBwdDQDKDVKernel
         return kargs;
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = !kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -531,7 +531,7 @@ struct FmhaBwdDQDKDVKernel
               float p_drop,
               const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
@@ -591,7 +591,7 @@ struct FmhaBwdDQDKDVKernel
             std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = !kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -650,9 +650,9 @@ struct FmhaBwdDQDKDVKernel
               ck_tile::index_t window_size_right,
               ck_tile::index_t mask_type,
               float p_drop,
-              const std::tuple<void*, void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
@@ -714,54 +714,54 @@ struct FmhaBwdDQDKDVKernel
 
     template <bool Cond = kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              const void* bias_ptr,
-              const void* lse_ptr,
-              const void* do_ptr,
-              const void* d_ptr,
-              void* rand_val_ptr,
-              void* dk_ptr,
-              void* dv_ptr,
-              void* dbias_ptr,
-              void* dq_acc_ptr,
-              const void* seqstart_q_ptr,
-              const void* seqstart_k_ptr,
-              const void* seqlen_k_ptr,
-              ck_tile::index_t hdim_q,
-              ck_tile::index_t hdim_v,
-              ck_tile::index_t num_head_q,
-              ck_tile::index_t nhead_ratio_qk,
-              float scale,
-              ck_tile::index_t stride_q,
-              ck_tile::index_t stride_k,
-              ck_tile::index_t stride_v,
-              ck_tile::index_t stride_bias,
-              ck_tile::index_t stride_randval,
-              ck_tile::index_t stride_do,
-              ck_tile::index_t stride_dq_acc,
-              ck_tile::index_t stride_dk,
-              ck_tile::index_t stride_dv,
-              ck_tile::index_t stride_dbias,
-              ck_tile::index_t nhead_stride_q,
-              ck_tile::index_t nhead_stride_k,
-              ck_tile::index_t nhead_stride_v,
-              ck_tile::index_t nhead_stride_bias,
-              ck_tile::index_t nhead_stride_randval,
-              ck_tile::index_t nhead_stride_do,
-              ck_tile::index_t nhead_stride_lsed,
-              ck_tile::index_t nhead_stride_dq_acc,
-              ck_tile::index_t nhead_stride_dk,
-              ck_tile::index_t nhead_stride_dv,
-              ck_tile::index_t nhead_stride_dbias,
-              ck_tile::index_t split_stride_dq_acc,
-              ck_tile::index_t window_size_left,
-              ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type,
-              float p_drop,
-              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                  drop_seed_offset)
+    MakeKargsImpl(const void* q_ptr,
+                  const void* k_ptr,
+                  const void* v_ptr,
+                  const void* bias_ptr,
+                  const void* lse_ptr,
+                  const void* do_ptr,
+                  const void* d_ptr,
+                  void* rand_val_ptr,
+                  void* dk_ptr,
+                  void* dv_ptr,
+                  void* dbias_ptr,
+                  void* dq_acc_ptr,
+                  const void* seqstart_q_ptr,
+                  const void* seqstart_k_ptr,
+                  const void* seqlen_k_ptr,
+                  ck_tile::index_t hdim_q,
+                  ck_tile::index_t hdim_v,
+                  ck_tile::index_t num_head_q,
+                  ck_tile::index_t nhead_ratio_qk,
+                  float scale,
+                  ck_tile::index_t stride_q,
+                  ck_tile::index_t stride_k,
+                  ck_tile::index_t stride_v,
+                  ck_tile::index_t stride_bias,
+                  ck_tile::index_t stride_randval,
+                  ck_tile::index_t stride_do,
+                  ck_tile::index_t stride_dq_acc,
+                  ck_tile::index_t stride_dk,
+                  ck_tile::index_t stride_dv,
+                  ck_tile::index_t stride_dbias,
+                  ck_tile::index_t nhead_stride_q,
+                  ck_tile::index_t nhead_stride_k,
+                  ck_tile::index_t nhead_stride_v,
+                  ck_tile::index_t nhead_stride_bias,
+                  ck_tile::index_t nhead_stride_randval,
+                  ck_tile::index_t nhead_stride_do,
+                  ck_tile::index_t nhead_stride_lsed,
+                  ck_tile::index_t nhead_stride_dq_acc,
+                  ck_tile::index_t nhead_stride_dk,
+                  ck_tile::index_t nhead_stride_dv,
+                  ck_tile::index_t nhead_stride_dbias,
+                  ck_tile::index_t split_stride_dq_acc,
+                  ck_tile::index_t window_size_left,
+                  ck_tile::index_t window_size_right,
+                  ck_tile::index_t mask_type,
+                  float p_drop,
+                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                      drop_seed_offset)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -858,7 +858,7 @@ struct FmhaBwdDQDKDVKernel
         return kargs;
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -909,7 +909,7 @@ struct FmhaBwdDQDKDVKernel
               float p_drop,
               const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
@@ -959,7 +959,7 @@ struct FmhaBwdDQDKDVKernel
             std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = kIsGroupMode>
     CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
@@ -1008,9 +1008,9 @@ struct FmhaBwdDQDKDVKernel
               ck_tile::index_t window_size_right,
               ck_tile::index_t mask_type,
               float p_drop,
-              const std::tuple<void*, void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 4443a4503..3de433d6a 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -64,7 +64,7 @@ struct FmhaFwdKernel
     template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
     // clang-format on
 
-    __host__ static std::string GetName()
+    CK_TILE_HOST static std::string GetName()
     {
         // sync with generate.py
         // clang-format off
@@ -267,50 +267,50 @@ struct FmhaFwdKernel
     using Kargs = std::conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
 
     template <bool Cond = !kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              const void* bias_ptr,
-              void* rand_val_ptr,
-              void* lse_ptr,
-              void* o_ptr,
-              ck_tile::index_t seqlen_q,
-              ck_tile::index_t seqlen_k,
-              ck_tile::index_t hdim_q,
-              ck_tile::index_t hdim_v,
-              ck_tile::index_t num_head_q,
-              ck_tile::index_t nhead_ratio_qk,
-              float scale_s,
-              float scale_p,
-              float scale_o,
-              ck_tile::index_t stride_q,
-              ck_tile::index_t stride_k,
-              ck_tile::index_t stride_v,
-              ck_tile::index_t stride_bias,
-              ck_tile::index_t stride_randval,
-              ck_tile::index_t stride_o,
-              ck_tile::index_t nhead_stride_q,
-              ck_tile::index_t nhead_stride_k,
-              ck_tile::index_t nhead_stride_v,
-              ck_tile::index_t nhead_stride_bias,
-              ck_tile::index_t nhead_stride_randval,
-              ck_tile::index_t nhead_stride_lse,
-              ck_tile::index_t nhead_stride_o,
-              ck_tile::index_t batch_stride_q,
-              ck_tile::index_t batch_stride_k,
-              ck_tile::index_t batch_stride_v,
-              ck_tile::index_t batch_stride_bias,
-              ck_tile::index_t batch_stride_randval,
-              ck_tile::index_t batch_stride_lse,
-              ck_tile::index_t batch_stride_o,
-              ck_tile::index_t window_size_left,
-              ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type,
-              float p_drop,
-              bool s_randval,
-              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                  drop_seed_offset)
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargsImpl(const void* q_ptr,
+                  const void* k_ptr,
+                  const void* v_ptr,
+                  const void* bias_ptr,
+                  void* rand_val_ptr,
+                  void* lse_ptr,
+                  void* o_ptr,
+                  ck_tile::index_t seqlen_q,
+                  ck_tile::index_t seqlen_k,
+                  ck_tile::index_t hdim_q,
+                  ck_tile::index_t hdim_v,
+                  ck_tile::index_t num_head_q,
+                  ck_tile::index_t nhead_ratio_qk,
+                  float scale_s,
+                  float scale_p,
+                  float scale_o,
+                  ck_tile::index_t stride_q,
+                  ck_tile::index_t stride_k,
+                  ck_tile::index_t stride_v,
+                  ck_tile::index_t stride_bias,
+                  ck_tile::index_t stride_randval,
+                  ck_tile::index_t stride_o,
+                  ck_tile::index_t nhead_stride_q,
+                  ck_tile::index_t nhead_stride_k,
+                  ck_tile::index_t nhead_stride_v,
+                  ck_tile::index_t nhead_stride_bias,
+                  ck_tile::index_t nhead_stride_randval,
+                  ck_tile::index_t nhead_stride_lse,
+                  ck_tile::index_t nhead_stride_o,
+                  ck_tile::index_t batch_stride_q,
+                  ck_tile::index_t batch_stride_k,
+                  ck_tile::index_t batch_stride_v,
+                  ck_tile::index_t batch_stride_bias,
+                  ck_tile::index_t batch_stride_randval,
+                  ck_tile::index_t batch_stride_lse,
+                  ck_tile::index_t batch_stride_o,
+                  ck_tile::index_t window_size_left,
+                  ck_tile::index_t window_size_right,
+                  ck_tile::index_t mask_type,
+                  float p_drop,
+                  bool s_randval,
+                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                      drop_seed_offset)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -399,9 +399,9 @@ struct FmhaFwdKernel
         return kargs;
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = !kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -445,53 +445,54 @@ struct FmhaFwdKernel
               bool s_randval,
               const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
     {
-        MakeKargs(q_ptr,
-                  k_ptr,
-                  v_ptr,
-                  bias_ptr,
-                  rand_val_ptr,
-                  lse_ptr,
-                  o_ptr,
-                  seqlen_q,
-                  seqlen_k,
-                  hdim_q,
-                  hdim_v,
-                  num_head_q,
-                  nhead_ratio_qk,
-                  scale_s,
-                  scale_p,
-                  scale_o,
-                  stride_q,
-                  stride_k,
-                  stride_v,
-                  stride_bias,
-                  stride_randval,
-                  stride_o,
-                  nhead_stride_q,
-                  nhead_stride_k,
-                  nhead_stride_v,
-                  nhead_stride_bias,
-                  nhead_stride_randval,
-                  nhead_stride_lse,
-                  nhead_stride_o,
-                  batch_stride_q,
-                  batch_stride_k,
-                  batch_stride_v,
-                  batch_stride_bias,
-                  batch_stride_randval,
-                  batch_stride_lse,
-                  batch_stride_o,
-                  window_size_left,
-                  window_size_right,
-                  mask_type,
-                  p_drop,
-                  s_randval,
-                  std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+        return MakeKargsImpl(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            rand_val_ptr,
+            lse_ptr,
+            o_ptr,
+            seqlen_q,
+            seqlen_k,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale_s,
+            scale_p,
+            scale_o,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_o,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_lse,
+            nhead_stride_o,
+            batch_stride_q,
+            batch_stride_k,
+            batch_stride_v,
+            batch_stride_bias,
+            batch_stride_randval,
+            batch_stride_lse,
+            batch_stride_o,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            s_randval,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = !kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -533,91 +534,92 @@ struct FmhaFwdKernel
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
-              const std::tuple<void*, void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset)
     {
-        MakeKargs(q_ptr,
-                  k_ptr,
-                  v_ptr,
-                  bias_ptr,
-                  rand_val_ptr,
-                  lse_ptr,
-                  o_ptr,
-                  seqlen_q,
-                  seqlen_k,
-                  hdim_q,
-                  hdim_v,
-                  num_head_q,
-                  nhead_ratio_qk,
-                  scale_s,
-                  scale_p,
-                  scale_o,
-                  stride_q,
-                  stride_k,
-                  stride_v,
-                  stride_bias,
-                  stride_randval,
-                  stride_o,
-                  nhead_stride_q,
-                  nhead_stride_k,
-                  nhead_stride_v,
-                  nhead_stride_bias,
-                  nhead_stride_randval,
-                  nhead_stride_lse,
-                  nhead_stride_o,
-                  batch_stride_q,
-                  batch_stride_k,
-                  batch_stride_v,
-                  batch_stride_bias,
-                  batch_stride_randval,
-                  batch_stride_lse,
-                  batch_stride_o,
-                  window_size_left,
-                  window_size_right,
-                  mask_type,
-                  p_drop,
-                  s_randval,
-                  std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
+        return MakeKargsImpl(
+            q_ptr,
+            k_ptr,
+            v_ptr,
+            bias_ptr,
+            rand_val_ptr,
+            lse_ptr,
+            o_ptr,
+            seqlen_q,
+            seqlen_k,
+            hdim_q,
+            hdim_v,
+            num_head_q,
+            nhead_ratio_qk,
+            scale_s,
+            scale_p,
+            scale_o,
+            stride_q,
+            stride_k,
+            stride_v,
+            stride_bias,
+            stride_randval,
+            stride_o,
+            nhead_stride_q,
+            nhead_stride_k,
+            nhead_stride_v,
+            nhead_stride_bias,
+            nhead_stride_randval,
+            nhead_stride_lse,
+            nhead_stride_o,
+            batch_stride_q,
+            batch_stride_k,
+            batch_stride_v,
+            batch_stride_bias,
+            batch_stride_randval,
+            batch_stride_lse,
+            batch_stride_o,
+            window_size_left,
+            window_size_right,
+            mask_type,
+            p_drop,
+            s_randval,
+            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
     template <bool Cond = kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              const void* bias_ptr,
-              void* rand_val_ptr,
-              void* lse_ptr,
-              void* o_ptr,
-              const void* seqstart_q_ptr,
-              const void* seqstart_k_ptr,
-              const void* seqlen_k_ptr,
-              ck_tile::index_t hdim_q,
-              ck_tile::index_t hdim_v,
-              ck_tile::index_t num_head_q,
-              ck_tile::index_t nhead_ratio_qk,
-              float scale_s,
-              float scale_p,
-              float scale_o,
-              ck_tile::index_t stride_q,
-              ck_tile::index_t stride_k,
-              ck_tile::index_t stride_v,
-              ck_tile::index_t stride_bias,
-              ck_tile::index_t stride_randval,
-              ck_tile::index_t stride_o,
-              ck_tile::index_t nhead_stride_q,
-              ck_tile::index_t nhead_stride_k,
-              ck_tile::index_t nhead_stride_v,
-              ck_tile::index_t nhead_stride_bias,
-              ck_tile::index_t nhead_stride_randval,
-              ck_tile::index_t nhead_stride_lse,
-              ck_tile::index_t nhead_stride_o,
-              ck_tile::index_t window_size_left,
-              ck_tile::index_t window_size_right,
-              ck_tile::index_t mask_type,
-              float p_drop,
-              bool s_randval,
-              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                  drop_seed_offset)
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
+    MakeKargsImpl(const void* q_ptr,
+                  const void* k_ptr,
+                  const void* v_ptr,
+                  const void* bias_ptr,
+                  void* rand_val_ptr,
+                  void* lse_ptr,
+                  void* o_ptr,
+                  const void* seqstart_q_ptr,
+                  const void* seqstart_k_ptr,
+                  const void* seqlen_k_ptr,
+                  ck_tile::index_t hdim_q,
+                  ck_tile::index_t hdim_v,
+                  ck_tile::index_t num_head_q,
+                  ck_tile::index_t nhead_ratio_qk,
+                  float scale_s,
+                  float scale_p,
+                  float scale_o,
+                  ck_tile::index_t stride_q,
+                  ck_tile::index_t stride_k,
+                  ck_tile::index_t stride_v,
+                  ck_tile::index_t stride_bias,
+                  ck_tile::index_t stride_randval,
+                  ck_tile::index_t stride_o,
+                  ck_tile::index_t nhead_stride_q,
+                  ck_tile::index_t nhead_stride_k,
+                  ck_tile::index_t nhead_stride_v,
+                  ck_tile::index_t nhead_stride_bias,
+                  ck_tile::index_t nhead_stride_randval,
+                  ck_tile::index_t nhead_stride_lse,
+                  ck_tile::index_t nhead_stride_o,
+                  ck_tile::index_t window_size_left,
+                  ck_tile::index_t window_size_right,
+                  ck_tile::index_t mask_type,
+                  float p_drop,
+                  bool s_randval,
+                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                      drop_seed_offset)
     {
         Kargs kargs{{q_ptr,
                      k_ptr,
@@ -702,9 +704,9 @@ struct FmhaFwdKernel
         return kargs;
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -742,7 +744,7 @@ struct FmhaFwdKernel
               bool s_randval,
               const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
@@ -781,9 +783,9 @@ struct FmhaFwdKernel
             std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
-    // std::variant can't take in a list initializer, overload for backward compatibility
+    // std::variant<> can't take in a list initializer, overload for backward compatibility
     template <bool Cond = kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
+    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -819,9 +821,9 @@ struct FmhaFwdKernel
               ck_tile::index_t mask_type,
               float p_drop,
               bool s_randval,
-              const std::tuple<void*, void*>& drop_seed_offset)
+              const std::tuple<const void*, const void*>& drop_seed_offset)
     {
-        return MakeKargs(
+        return MakeKargsImpl(
             q_ptr,
             k_ptr,
             v_ptr,
@@ -860,15 +862,15 @@ struct FmhaFwdKernel
             std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
     }
 
-    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size_,
-                                            ck_tile::index_t nhead_,
-                                            ck_tile::index_t seqlen_q_,
-                                            ck_tile::index_t hdim_v_)
+    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_,
+                                                ck_tile::index_t nhead_,
+                                                ck_tile::index_t seqlen_q_,
+                                                ck_tile::index_t hdim_v_)
     {
         return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
     }
 
-    __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
 
     CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
     {
-- 
GitLab


From 440e28b08fa0f503c229f5787be4f775ad20484c Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Tue, 26 Nov 2024 11:14:56 +0800
Subject: [PATCH 078/153] [CK_TILE] fused-moe first version (#1634)

* moe pipeline

* update code

* compile OK

* update

* update cpu reference

* update pipeline_gemm0

* compiler ok

* update pipeline

* rename to ex pipeline

* block-asm

* update

* update

* update first gemm ok

* compute correct

* update file structure

* update README

* update

* update

* update code

* update API

* return unsupport case

* add comment

* update readme

* update

* uncomment

* update

* fix build err

---------

Co-authored-by: valarLip <340077269@qq.com>
---
 .../alternative_impl/matrix_core_swizzle.cpp  |   4 +-
 .../matrix_core_swizzle_kernel.hpp            |  12 +-
 example/ck_tile/06_permute/permute.cpp        |   2 +-
 .../13_moe_sorting/moe_sorting_api.hpp        |   2 +-
 example/ck_tile/15_fused_moe/CMakeLists.txt   |  19 +
 example/ck_tile/15_fused_moe/README.md        |  69 ++
 example/ck_tile/15_fused_moe/fused_moe.hpp    |  52 ++
 .../ck_tile/15_fused_moe/fused_moegemm.hpp    |  84 ++
 .../ck_tile/15_fused_moe/fused_moesorting.hpp |  20 +
 .../15_fused_moe/instances/fused_moe_api.cpp  |  80 ++
 .../instances/fused_moegemm_api.cpp           |  33 +
 .../instances/fused_moegemm_api_internal.hpp  |  60 ++
 .../instances/fused_moegemm_api_traits.hpp    |  53 ++
 .../instances/fused_moegemm_bf16_m32.cpp      |  14 +
 .../instances/fused_moegemm_fp16_m32.cpp      |  14 +
 .../instances/fused_moesorting_api.cpp        |  73 ++
 example/ck_tile/15_fused_moe/main.cpp         | 603 +++++++++++++
 example/ck_tile/15_fused_moe/misc/moe-0.png   | Bin 0 -> 76830 bytes
 example/ck_tile/15_fused_moe/misc/moe-1.png   | Bin 0 -> 92535 bytes
 example/ck_tile/15_fused_moe/misc/moe-2.png   | Bin 0 -> 126766 bytes
 example/ck_tile/15_fused_moe/misc/moe-3.png   | Bin 0 -> 18655 bytes
 example/ck_tile/CMakeLists.txt                |   2 +
 include/ck_tile/core.hpp                      |   2 +
 .../core/arch/amd_buffer_addressing.hpp       | 103 +++
 include/ck_tile/core/arch/arch.hpp            |  18 +
 include/ck_tile/core/arch/utility.hpp         |  24 +
 include/ck_tile/core/tensor/buffer_view.hpp   |  86 +-
 include/ck_tile/core/tensor/load_tile.hpp     |  54 +-
 .../core/tensor/static_distributed_tensor.hpp |  26 +
 include/ck_tile/core/tensor/tensor_view.hpp   |  42 +
 include/ck_tile/core/tensor/tile_window.hpp   |  74 +-
 .../core/tensor/tile_window_linear.hpp        | 159 +++-
 .../ck_tile/core/tensor/tile_window_utils.hpp |  54 ++
 include/ck_tile/core/tensor/update_tile.hpp   |  56 +-
 .../ck_tile/core/utility/static_counter.hpp   | 116 +++
 include/ck_tile/host.hpp                      |   2 +
 include/ck_tile/host/device_memory.hpp        |  35 +
 include/ck_tile/host/fill.hpp                 | 113 ++-
 include/ck_tile/host/host_tensor.hpp          | 121 ++-
 include/ck_tile/host/joinable_thread.hpp      |  27 +
 .../host/reference/reference_fused_moe.hpp    | 196 +++++
 .../host/reference/reference_permute.hpp      |  23 +-
 .../unary_element_wise_operation.hpp          |  99 +++
 include/ck_tile/ops/flatmm.hpp                |  10 +
 .../flatmm_32x512x128_1x4x1_16x16x32.hpp      | 615 +++++++++++++
 .../flatmm_sn_32x128x512_1x4x1_16x16x32.hpp   | 562 ++++++++++++
 .../ops/flatmm/block/flatmm_uk_config.hpp     |  10 +
 include/ck_tile/ops/flatmm/block/uk/README.md |   1 +
 ...m_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc | 613 +++++++++++++
 ...atmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc | 516 +++++++++++
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    |  19 +-
 include/ck_tile/ops/fused_moe.hpp             |   8 +
 .../fused_moe/kernel/fused_moegemm_kernel.hpp | 421 +++++++++
 .../fused_moe/kernel/fused_moegemm_shape.hpp  | 125 +++
 .../kernel/fused_moegemm_tile_partitioner.hpp |  33 +
 .../fused_moegemm_pipeline_flatmm_ex.hpp      | 651 ++++++++++++++
 .../fused_moegemm_pipeline_flatmm_policy.hpp  | 831 ++++++++++++++++++
 .../fused_moegemm_pipeline_flatmm_uk.hpp      | 354 ++++++++
 .../fused_moegemm_pipeline_problem.hpp        |  46 +
 .../pipeline/fused_moegemm_traits.hpp         |  48 +
 include/ck_tile/ops/gemm/warp/warp_gemm.hpp   | 130 +--
 .../gemm/warp/warp_gemm_attribute_mfma.hpp    | 170 +++-
 .../warp/warp_gemm_attribute_mfma_impl.hpp    | 457 +++++++---
 .../ops/gemm/warp/warp_gemm_dispatcher.hpp    |  58 +-
 .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp  |  61 +-
 include/ck_tile/ops/moe_sorting.hpp           |  11 -
 66 files changed, 8067 insertions(+), 309 deletions(-)
 create mode 100644 example/ck_tile/15_fused_moe/CMakeLists.txt
 create mode 100644 example/ck_tile/15_fused_moe/README.md
 create mode 100644 example/ck_tile/15_fused_moe/fused_moe.hpp
 create mode 100644 example/ck_tile/15_fused_moe/fused_moegemm.hpp
 create mode 100644 example/ck_tile/15_fused_moe/fused_moesorting.hpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp
 create mode 100644 example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
 create mode 100644 example/ck_tile/15_fused_moe/main.cpp
 create mode 100644 example/ck_tile/15_fused_moe/misc/moe-0.png
 create mode 100644 example/ck_tile/15_fused_moe/misc/moe-1.png
 create mode 100644 example/ck_tile/15_fused_moe/misc/moe-2.png
 create mode 100644 example/ck_tile/15_fused_moe/misc/moe-3.png
 create mode 100644 include/ck_tile/core/tensor/tile_window_utils.hpp
 create mode 100644 include/ck_tile/core/utility/static_counter.hpp
 create mode 100644 include/ck_tile/host/joinable_thread.hpp
 create mode 100644 include/ck_tile/host/reference/reference_fused_moe.hpp
 create mode 100644 include/ck_tile/ops/flatmm.hpp
 create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
 create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp
 create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp
 create mode 100644 include/ck_tile/ops/flatmm/block/uk/README.md
 create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
 create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
 create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp
 create mode 100644 include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
 delete mode 100644 include/ck_tile/ops/moe_sorting.hpp

diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
index 93c662a28..e5ded0ef3 100644
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
@@ -40,7 +40,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t,
             else if(t.permute.compare("0,1,3,4,2,5") == 0)
             {
                 constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv;
+                    matrix_core_permute_style::b_nr_kr_kw_nw_kv;
                 using Kernel =
                     matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
 
@@ -83,7 +83,7 @@ float matrix_core_swizzle(matrix_core_swizzle_traits t,
             else if(t.permute.compare("0,1,3,4,2,5") == 0)
             {
                 constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_nr_kr_kw_nw_kv;
+                    matrix_core_permute_style::b_nr_kr_kw_nw_kv;
                 using Kernel =
                     matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
 
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
index 60ac103ec..28f4c452b 100644
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -42,8 +42,8 @@ enum class matrix_core_permute_style
 {
     permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6
     permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6
-    permute_b_nr_kr_kw_nw_kv    = 2, // 0,1,3,4,2,5
-    permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv,
+    b_nr_kr_kw_nw_kv            = 2, // 0,1,3,4,2,5
+    b_nr_kr_waveflatten         = b_nr_kr_kw_nw_kv,
 };
 
 // assume this is B matrix, originally we have batch*n*k
@@ -203,7 +203,7 @@ struct matrix_core_swizzle_kernel
             else
             {
                 // clang-format off
-                // permute_b_nr_kr_kw_nw_kv or permute_b_nr_kr_waveflatten
+                // b_nr_kr_kw_nw_kv or b_nr_kr_waveflatten
                 constexpr index_t Kv = Alignment;
                 constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
                 constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
@@ -332,7 +332,7 @@ struct matrix_core_swizzle_kernel
                         make_tuple(sequence<0>{}, sequence<1>{}));
                     return tmp_1;
 #else
-                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv,
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv,
                     constexpr index_t kv = Alignment;
                     constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
                     constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
@@ -376,13 +376,13 @@ struct matrix_core_swizzle_kernel
                 else
                 {
 #if MERGE_2D_013425
-                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
                     return make_tile_window(dst_view,
                                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
                                             {i_n * NPerBlock, i_k * KPerBlock},
                                             get_dst_dist());
 #else
-                    // permute_b_nr_kr_waveflatten = permute_b_nr_kr_kw_nw_kv
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
                     constexpr index_t kv = Alignment;
                     constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
                     constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp
index af95b64e6..477ae370b 100644
--- a/example/ck_tile/06_permute/permute.cpp
+++ b/example/ck_tile/06_permute/permute.cpp
@@ -264,7 +264,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     {
         if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5"))
         {
-            // permute_b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
+            // b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
             matrix_core_swizzle_traits t;
             t.data_type = data_type;
             t.permute   = arg_parser.get_str("perm");
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
index 91b54932c..0cb393f7d 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -5,7 +5,7 @@
 #include <string>
 #include "ck_tile/core.hpp"
 #include "ck_tile/host.hpp"
-#include "ck_tile/ops/moe_sorting.hpp"
+#include "ck_tile/ops/fused_moe.hpp"
 
 struct moe_sorting_trait
 {
diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt
new file mode 100644
index 000000000..a716eef19
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe")
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+message("adding ${TILE_EXAPMLE_FUSED_MOE}")
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp)
+target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})
+
+set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a
+list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta
+# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS  -mllvm -greedy-reverse-local-assignment=1)
+# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+
+target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS})
diff --git a/example/ck_tile/15_fused_moe/README.md b/example/ck_tile/15_fused_moe/README.md
new file mode 100644
index 000000000..dd566c166
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/README.md
@@ -0,0 +1,69 @@
+# fused-moe
+Implementing the fused-moe block operator using ck-tile. This is a scatter/gather-group-gemm based solution, similiar to that of [vllm moe](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), but we introduce more kernel fusion to boost performance
+![](misc/moe-0.png)
+
+The benifit of this fused-moe:
+* 1.5~2x perf boost compared with current vllm solution
+* zero workspace to reduce memory footprint
+* much less kernel instance, easy to maintain
+
+# Implementation and feature support
+## moe-sorting
+this is a common pre-process step before the actual moe-gemm. The purpose is to transform the moe loop over from token-by-token to expert-by-expert, make sure very workgroup is working for a single expert (B matrix). Besides, we extend this op to do the zeroing of the output buffer(to be used for reduce buffer with atomic)
+
+## moe-gemm
+`moe-gemm` is a group-gemm based back-to-back gemm, where the row-id of input token comes from another buffer. Naive understanding of fused-moe is from token-by-token view as below picture:
+![](misc/moe-1.png)
+After `moe-sorting`, we can view this algorithm as expert-by-expert, as below:
+![](misc/moe-2.png)
+
+## optimization
+summary of the key design of this fused-moe operator:
+* fuse 2 group-gemm + activation + `topk-weight` multiply into single kernel, using atomic for 2nd gemm accumualation
+* fuse buffer-zeroing in `moe-sorgin`, user no longer need call extra torch.zero() for the out buffer
+* fused scatter-gather for row index(same as vllm)
+* pre-shuffle B matric(weight) to maximize memory throughput. input(activation) keep original layout `[batch, hidden]`.
+* extrem optimized pipeline using block-inline-asm(we call it `micro-kernel` or `uk`), while not breaking the *composable* design of ck
+
+## 
+```
+// [indexing implementation-1]
+// using M_a as constexpr block_size to partition all tokens into different slices
+// each slice map to one expert, and one expert can have multiple slices
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+//
+// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
+// * this could be larger than actual, since actual tokens are on GPU
+//
+// sorted_token_ids_ptr   : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5]
+//                          |-  exp-0  -|-  exp-1  -|-  exp-2  -|-      exp-3          -|-  exp-4 -|-  exp-5  -|
+// sorted_weight_ptr      : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o]
+//
+// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
+//
+// sorted_expert_ids_ptr  : [0, 1, 2, 3, 3, 4, 5]
+// * length is (max_num_tokens_padded + block_size - 1) / block_size
+//
+// num_tokens_post_padded_ptr : [28]
+// num_sorted_tiles_ptr : [7]
+//
+// * different from vLLM
+//   1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id
+//   2）need sorted_weight_ptr
+//   3) use num_sorted_tiles_ptr, already divided by M_a
+//
+// * below used for indexing
+//  1) sorted_token_ids_ptr [max_num_tokens_padded]
+//  2) sorted_weight_ptr
+//  3) sorted_expert_ids_ptr
+//  4）num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one)
+//
+//   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
+```
\ No newline at end of file
diff --git a/example/ck_tile/15_fused_moe/fused_moe.hpp b/example/ck_tile/15_fused_moe/fused_moe.hpp
new file mode 100644
index 000000000..6bd7688d8
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/fused_moe.hpp
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "fused_moesorting.hpp"
+#include "fused_moegemm.hpp"
+
+struct fused_moe_args
+{
+    const void* a_ptr;              // [m, k], input token
+    const void* a_scale_ptr;        // [m, 1], token scale
+    const void* g_ptr;              // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w])
+    const void* d_ptr;              // [e, n, k], pre-shuffle([e, nr, kr, w])
+    const void* g_scale_ptr;        // [e, 1, n], gate(up) scale
+    const void* d_scale_ptr;        // [e, 1, k], down scale
+    const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input
+    void* o_ptr;                    // [m, k], output token (no need to do zeroing)
+
+    const void* topk_ids_ptr;    // [tokens, topk]
+    const void* topk_weight_ptr; // [tokens, topk]
+    void* sorted_token_ids_ptr;  // [max_num_tokens_padded]
+    void* sorted_weight_ptr;     // [max_num_tokens_padded]
+    void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size]
+    void* num_sorted_tiles_ptr;  // [1]
+
+    ck_tile::index_t block_m;           // block_m, used to devide the input
+    ck_tile::index_t hidden_size;       // k
+    ck_tile::index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2
+    ck_tile::index_t num_tokens;        // input number of tokens for current iteration
+    ck_tile::index_t num_experts;       // number of groups
+    ck_tile::index_t topk;              // need this?
+
+    ck_tile::index_t stride_token; // for input/output, stride for each row, should >= hidden_size
+};
+
+// This is the public API, will be generated by script
+struct fused_moe_traits
+{
+    std::string prec_i;  // input precision
+    std::string prec_w;  // weight precision
+    std::string prec_o;  // output precision
+    std::string prec_st; // token scale data type
+    std::string prec_sw; // weight scale data type
+    std::string prec_sq; // smooth quant scale
+    std::string prec_kw; // topk-weight data type
+    int block_m;
+    int gate_only;
+    int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+};
+
+float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/15_fused_moe/fused_moegemm.hpp b/example/ck_tile/15_fused_moe/fused_moegemm.hpp
new file mode 100644
index 000000000..b8e51475a
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/fused_moegemm.hpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/fused_moe.hpp"
+#include <string>
+
+// this is only a convenient structure for creating an example
+// this is not part of the host API
+template <typename I, typename W, typename O, typename ST, typename SW, typename SQ, typename KW>
+struct FusedMoeGemmTypeConfig;
+
+template <typename ST, typename SW, typename SQ, typename KW>
+struct FusedMoeGemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t, ST, SW, SQ, KW>
+{
+    using ADataType            = ck_tile::bf16_t;
+    using GDataType            = ck_tile::bf16_t;
+    using DDataType            = ck_tile::bf16_t;
+    using AccDataType          = float;
+    using ODataType            = ck_tile::bf16_t;
+    using AScaleDataType       = ck_tile::remove_cvref_t<ST>;
+    using GScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using DScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using YSmoothScaleDataType = ck_tile::remove_cvref_t<SQ>;
+    using TopkWeightDataType   = ck_tile::remove_cvref_t<KW>;
+    using IndexDataType        = ck_tile::index_t;
+};
+
+template <typename ST, typename SW, typename SQ, typename KW>
+struct FusedMoeGemmTypeConfig<ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, ST, SW, SQ, KW>
+{
+    using ADataType            = ck_tile::fp16_t;
+    using GDataType            = ck_tile::fp16_t;
+    using DDataType            = ck_tile::fp16_t;
+    using AccDataType          = float;
+    using ODataType            = ck_tile::fp16_t;
+    using AScaleDataType       = ck_tile::remove_cvref_t<ST>;
+    using GScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using DScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using YSmoothScaleDataType = ck_tile::remove_cvref_t<SQ>;
+    using TopkWeightDataType   = ck_tile::remove_cvref_t<KW>;
+    using IndexDataType        = ck_tile::index_t;
+};
+
+template <typename ST, typename SW, typename SQ, typename KW>
+struct FusedMoeGemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, ck_tile::bf16_t, ST, SW, SQ, KW>
+{
+    using ADataType            = ck_tile::int8_t;
+    using GDataType            = ck_tile::int8_t;
+    using DDataType            = ck_tile::int8_t;
+    using AccDataType          = int32_t;
+    using ODataType            = ck_tile::bf16_t;
+    using AScaleDataType       = ck_tile::remove_cvref_t<ST>;
+    using GScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using DScaleDataType       = ck_tile::remove_cvref_t<SW>;
+    using YSmoothScaleDataType = ck_tile::remove_cvref_t<SQ>;
+    using TopkWeightDataType   = ck_tile::remove_cvref_t<KW>;
+    using IndexDataType        = ck_tile::index_t;
+};
+
+// runtime args
+struct fused_moegemm_args : public ck_tile::FusedMoeGemmHostArgs
+{
+};
+
+// This is the public API, will be generated by script
+struct fused_moegemm_traits
+{
+    std::string prec_i;  // input precision
+    std::string prec_w;  // weight precision
+    std::string prec_o;  // output precision
+    std::string prec_st; // token scale data type
+    std::string prec_sw; // weight scale data type
+    std::string prec_sq; // smooth quant scale
+    std::string prec_kw; // topk-weight data type
+    int block_m;
+    int gate_only;
+    int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+};
+
+float fused_moegemm(fused_moegemm_traits, fused_moegemm_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/15_fused_moe/fused_moesorting.hpp b/example/ck_tile/15_fused_moe/fused_moesorting.hpp
new file mode 100644
index 000000000..57dace9b4
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/fused_moe.hpp"
+
+struct fused_moesorting_trait
+{
+    std::string index_type;
+    std::string weight_type; // currently always float
+};
+
+struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs
+{
+};
+
+float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s);
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
new file mode 100644
index 000000000..bfc0ce409
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "fused_moe.hpp"
+
+float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s)
+{
+    auto s_sub = ck_tile::stream_config{s.stream_id_, false, s.log_level_, 0, 1};
+
+    auto o_data_bytes = [&]() {
+        if(t.prec_o == "fp32")
+            return 4;
+        else if(t.prec_o == "fp16" || t.prec_o == "bf16")
+            return 2;
+        else if(t.prec_o == "int8" || t.prec_o == "fp8")
+            return 1;
+        return 1;
+    }();
+
+    auto t0 = fused_moesorting_trait{"int32", "fp32"};
+    auto a0 = fused_moesorting_args{
+        a.topk_ids_ptr,                              // const void* p_topk_ids;
+        a.topk_weight_ptr,                           // const void* p_weights;
+        a.sorted_token_ids_ptr,                      // void* p_sorted_token_ids;
+        a.sorted_weight_ptr,                         // void* p_sorted_weights;
+        a.sorted_expert_ids_ptr,                     // void* p_sorted_expert_ids;
+        a.num_sorted_tiles_ptr,                      // void* p_total_tokens_post_pad;
+        a.o_ptr,                                     // void* p_moe_buf;
+        a.num_tokens,                                // index_t tokens;
+        a.block_m,                                   // index_t unit_size;
+        a.num_experts,                               // index_t num_experts;
+        a.topk,                                      // index_t topk;
+        a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes;
+    };
+
+    auto t1 = fused_moegemm_traits{t.prec_i,
+                                   t.prec_w,
+                                   t.prec_o,
+                                   t.prec_st,
+                                   t.prec_sw,
+                                   t.prec_sq,
+                                   t.prec_kw,
+                                   t.block_m,
+                                   t.gate_only,
+                                   t.fused_quant};
+    auto a1 = fused_moegemm_args{
+        a.a_ptr,                 // const void* a_ptr;
+        a.a_scale_ptr,           // const void* a_scale_ptr;
+        a.g_ptr,                 // const void* g_ptr;
+        a.d_ptr,                 // const void* d_ptr;
+        a.g_scale_ptr,           // const void* g_scale_ptr;
+        a.d_scale_ptr,           // const void* d_scale_ptr;
+        a.y_smooth_scale_ptr,    // const void* y_smooth_scale_ptr;
+        a.o_ptr,                 // void* o_ptr;
+        a.sorted_token_ids_ptr,  // const void* sorted_token_ids_ptr;
+        a.sorted_weight_ptr,     // const void* sorted_weight_ptr;
+        a.sorted_expert_ids_ptr, // const void* sorted_expert_ids_ptr;
+        a.num_sorted_tiles_ptr,  // const void* num_sorted_tiles_ptr;
+        a.hidden_size,           // index_t hidden_size;
+        a.intermediate_size,     // index_t intermediate_size;
+        a.num_tokens,            // index_t num_tokens;
+        a.num_experts,           // index_t num_experts;
+        a.topk,                  // index_t topk;
+        a.stride_token           // index_t stride_token;
+    };
+
+    float r0 = -1;
+    float r1 = -1;
+
+    float r = ck_tile::launch_kernel(
+        s,
+        [=, &r0](const ck_tile::stream_config&) { r0 = fused_moesorting(t0, a0, s_sub); },
+        [=, &r1](const ck_tile::stream_config&) { r1 = fused_moegemm(t1, a1, s_sub); });
+
+    // keep unsupported case return negative
+    if(r0 < 0 || r1 < 0)
+        return -1;
+
+    return r;
+}
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp
new file mode 100644
index 000000000..c1a4c495c
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "fused_moegemm.hpp"
+#include "fused_moegemm_api_traits.hpp"
+
+// Note: this internal API only declare, not define here, otherwise will block `make -j`
+template <typename Traits_>
+float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a);
+
+template <ck_tile::index_t... Is>
+using S = ck_tile::sequence<Is...>;
+
+float fused_moegemm(fused_moegemm_traits t, fused_moegemm_args a, const ck_tile::stream_config& s)
+{
+    // clang-format off
+    float r = -1;
+    if(t.prec_i == "bf16" && t.prec_w == "bf16" && t.prec_o == "bf16" && t.prec_st == "fp32" &&
+       t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1)
+    {
+        using t_ = fmoe_<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t, float, float, float, float, S<32, 512, 128, 128>, S<1, 4, 1>, S<16, 16, 32>, 1, 0>;
+        r = fused_moegemm_<t_>(s, a);
+    }
+    else if(t.prec_i == "fp16" && t.prec_w == "fp16" && t.prec_o == "fp16" && t.prec_st == "fp32" &&
+       t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1)
+    {
+        using t_ = fmoe_<ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, float, float, float, float, S<32, 512, 128, 128>, S<1, 4, 1>, S<16, 16, 32>, 1, 0>;
+        r = fused_moegemm_<t_>(s, a);
+    }
+    // clang-format on
+    return r;
+}
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
new file mode 100644
index 000000000..5872179ef
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "fused_moegemm_api_traits.hpp"
+#include "ck_tile/ops/fused_moe.hpp"
+#include <iostream>
+
+template <ck_tile::index_t... Is>
+using S = ck_tile::sequence<Is...>;
+
+// do not the define of this tepmlate function inside the _api.cpp, otherwise will block make -j
+template <typename Ts_>
+float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
+{
+    using f_traits = ck_tile::FusedMoeGemmTraits<Ts_::GateOnly, Ts_::FusedQuant == 1, 1 /*atomic*/>;
+    using f_shape  = ck_tile::FusedMoeGemmShape<typename Ts_::BlockTile_0,
+                                               typename Ts_::WarpPerBlock_0,
+                                               typename Ts_::WarpTile_0,
+                                               typename Ts_::BlockTile_1,
+                                               typename Ts_::WarpPerBlock_0,
+                                               typename Ts_::WarpTile_0>;
+    using f_problem =
+        ck_tile::FusedMoeGemmPipelineProblem<typename Ts_::ADataType,
+                                             typename Ts_::GDataType,
+                                             typename Ts_::DDataType,
+                                             typename Ts_::AccDataType,
+                                             typename Ts_::ODataType,
+                                             typename Ts_::AScaleDataType,
+                                             typename Ts_::GScaleDataType,
+                                             typename Ts_::DScaleDataType,
+                                             typename Ts_::YSmoothScaleDataType,
+                                             typename Ts_::TopkWeightDataType,
+                                             typename Ts_::IndexDataType,
+                                             ck_tile::element_wise::FastGeluAsm, // TODO: hardcoded
+                                             f_shape,
+                                             f_traits>;
+
+    // using f_pipeline    = ck_tile::FusedMoeGemmPipeline_FlatmmEx<f_problem>;
+    using f_pipeline    = ck_tile::FusedMoeGemmPipeline_FlatmmUk<f_problem>;
+    using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_Linear<f_shape>;
+    using f_kernel      = ck_tile::FusedMoeGemmKernel<f_partitioner, f_pipeline, void>;
+
+    const dim3 grids                       = f_kernel::GridSize(a);
+    constexpr dim3 blocks                  = f_kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    static int printed = 0;
+
+    auto kargs = f_kernel::MakeKargs(a);
+    if(s.log_level_ > 0 && printed == 0)
+    {
+        std::cout << ", " << f_kernel::GetName() << std::flush;
+        printed = 1;
+    }
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(f_kernel{}, grids, blocks, 0, kargs));
+}
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
new file mode 100644
index 000000000..cc476685d
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_traits.hpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ck_tile/core.hpp>
+
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename I,
+          typename W,
+          typename O,
+          typename ST,
+          typename SW,
+          typename SQ,
+          typename KW,
+          typename BlockTIle_, // seq<b_token, b_interm, b_hidden, b_down>
+          typename WarpPerBlock_,
+          typename WarpTile_, // seq<*,*,*>, used to select mfma
+          ck_tile::index_t GateOnly_   = 0,
+          ck_tile::index_t FusedQuant_ = 0>
+struct fmoe_ // traits, ugly name, only used for internal
+{
+    using TypeConfig = FusedMoeGemmTypeConfig<I, W, O, ST, SW, SQ, KW>;
+
+    using ADataType            = ck_tile::remove_cvref_t<typename TypeConfig::ADataType>;
+    using GDataType            = ck_tile::remove_cvref_t<typename TypeConfig::GDataType>;
+    using DDataType            = ck_tile::remove_cvref_t<typename TypeConfig::DDataType>;
+    using AccDataType          = ck_tile::remove_cvref_t<typename TypeConfig::AccDataType>;
+    using ODataType            = ck_tile::remove_cvref_t<typename TypeConfig::ODataType>;
+    using AScaleDataType       = ck_tile::remove_cvref_t<typename TypeConfig::AScaleDataType>;
+    using GScaleDataType       = ck_tile::remove_cvref_t<typename TypeConfig::GScaleDataType>;
+    using DScaleDataType       = ck_tile::remove_cvref_t<typename TypeConfig::DScaleDataType>;
+    using YSmoothScaleDataType = ck_tile::remove_cvref_t<typename TypeConfig::YSmoothScaleDataType>;
+    using TopkWeightDataType   = ck_tile::remove_cvref_t<typename TypeConfig::TopkWeightDataType>;
+    using IndexDataType        = ck_tile::remove_cvref_t<typename TypeConfig::IndexDataType>;
+
+    static constexpr ck_tile::index_t BT_ = BlockTIle_::at(ck_tile::number<0>{}); // block token
+    static constexpr ck_tile::index_t BI_ =
+        BlockTIle_::at(ck_tile::number<1>{}); // block intermediate
+    static constexpr ck_tile::index_t BH_ = BlockTIle_::at(ck_tile::number<2>{}); // block hidden
+    static constexpr ck_tile::index_t BD_ = BlockTIle_::at(ck_tile::number<3>{}); // block down
+
+    using BlockTile_0    = ck_tile::sequence<BT_, BI_, BH_>;
+    using WarpPerBlock_0 = ck_tile::remove_cvref_t<WarpPerBlock_>;
+    using WarpTile_0     = ck_tile::remove_cvref_t<WarpTile_>;
+
+    using BlockTile_1    = ck_tile::sequence<BT_, BD_, BI_ / (GateOnly_ ? 1 : 2)>;
+    using WarpPerBlock_1 = ck_tile::remove_cvref_t<WarpPerBlock_>;
+    using WarpTile_1     = ck_tile::remove_cvref_t<WarpTile_>;
+
+    static constexpr ck_tile::index_t GateOnly   = GateOnly_;
+    static constexpr ck_tile::index_t FusedQuant = FusedQuant_;
+};
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp
new file mode 100644
index 000000000..93f9c7786
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_bf16_m32.cpp
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "fused_moegemm.hpp"
+#include "fused_moegemm_api_traits.hpp"
+#include "fused_moegemm_api_internal.hpp"
+
+// clang-format off
+template float fused_moegemm_<
+    fmoe_<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t, float, float, float, float, S<32, 512, 128, 128>, S<1, 4, 1>, S<16, 16, 32>, 1, 0>
+>(const ck_tile::stream_config& s, fused_moegemm_args a);
+
+// clang-format on
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp
new file mode 100644
index 000000000..b8a823e8e
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_fp16_m32.cpp
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "fused_moegemm.hpp"
+#include "fused_moegemm_api_traits.hpp"
+#include "fused_moegemm_api_internal.hpp"
+
+// clang-format off
+template float fused_moegemm_<
+    fmoe_<ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, float, float, float, float, S<32, 512, 128, 128>, S<1, 4, 1>, S<16, 16, 32>, 1, 0>
+>(const ck_tile::stream_config& s, fused_moegemm_args a);
+
+// clang-format on
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
new file mode 100644
index 000000000..75aaf86b7
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "fused_moesorting.hpp"
+
+#define MOE_SORTING_DISPATCH(unroll_num_)                                                   \
+    constexpr ck_tile::index_t unroll_num = unroll_num_;                                    \
+    using ms_problem     = ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                           \
+    auto kargs           = kernel::MakeKargs(a);                                            \
+    const dim3 grids     = kernel::GridSize(a);                                             \
+    const dim3 blocks    = kernel::BlockSize(a);                                            \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                          \
+    float ave_time       = ck_tile::launch_kernel(                                          \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));          \
+    return ave_time;
+
+float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s)
+{
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        if(a.num_experts > 127)
+        {
+            printf("lds size exceed, only support experts <127 \n");
+            return -1;
+        }
+        if(a.moe_buf_bytes % 16)
+        {
+            printf("buf set size %d unaligned, must be multiple of 16\n", a.moe_buf_bytes);
+            return -1;
+        }
+        using index_t              = ck_tile::index_t;
+        using ms_weight_type       = float;
+        index_t smem_io_unroll_num = ck_tile::integer_divide_ceil(a.tokens * a.topk, 64);
+        switch(smem_io_unroll_num)
+        {
+        case(1): {
+            MOE_SORTING_DISPATCH(1);
+        }
+        case(2): {
+            MOE_SORTING_DISPATCH(2);
+        }
+        case(3): {
+            MOE_SORTING_DISPATCH(3);
+        }
+        case(5): {
+            MOE_SORTING_DISPATCH(5);
+        }
+        case(6): {
+            MOE_SORTING_DISPATCH(6);
+        }
+        case(7): {
+            MOE_SORTING_DISPATCH(7);
+        }
+        case(8): {
+            MOE_SORTING_DISPATCH(8);
+        }
+        case(9): {
+            MOE_SORTING_DISPATCH(9);
+        }
+        case(10): {
+            MOE_SORTING_DISPATCH(10);
+        }
+        case(11): {
+            MOE_SORTING_DISPATCH(11);
+        }
+        default: {
+            MOE_SORTING_DISPATCH(4);
+        }
+        }
+    }
+    return -1;
+}
diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp
new file mode 100644
index 000000000..2f44f903e
--- /dev/null
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -0,0 +1,603 @@
+#include <algorithm>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+#include <set>
+
+#include "ck_tile/host.hpp"
+#include "fused_moe.hpp"
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+// mfma_type, 0:32x32, 1:16x16
+// TODO: padding?
+template <typename T>
+auto shuffle_moe_weight(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma_type = 0)
+{
+    assert(t.get_lengths().size() == 3);
+    int b_ = t.get_lengths()[0];
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[2];
+    if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({b_, n_ / 32, 32, k_ / 16, 2, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5});
+    }
+    else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({b_, n_ / 16, 16, k_ / 32, 4, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({b_, n_ / 32, 32, k_ / 32, 2, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({b_, n_ / 16, 16, k_ / 64, 4, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 1, 3, 4, 2, 5});
+    }
+    return t;
+}
+
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    size_t total_size = topk * tokens;
+    std::srand(seed);
+    std::set<IndexType> unique_set;
+    IndexType current_v;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+        {
+            unique_set.clear();
+        }
+        current_v = std::rand() % num_expert;
+        while(unique_set.find(current_v) != unique_set.end())
+        {
+            current_v = std::rand() % num_expert;
+        }
+        unique_set.insert(current_v);
+        host_tensor[i] = current_v;
+    }
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("t", "128", "num input tokens")
+        .insert("e", "32", "num of experts")
+        .insert("k", "5", "topk")
+        .insert("h", "8192", "hidden_size of this model")
+        .insert("i", "8192", "intermediate_size between 2 gemms of FFN")
+        .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size")
+        .insert("bm", "32", "blocking factor for sorted tokens")
+        .insert("tp", "8", "tensor parallel size")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec_i", "bf16", "input precision")
+        .insert("prec_w", "bf16", "weight precision")
+        .insert("prec_o", "bf16", "output precision")
+        .insert("prec_st", "auto", "token scale data type. auto will set to fp32")
+        .insert("prec_sw", "auto", "weight scale data type. auto will set to fp32")
+        .insert("prec_sq", "auto", "(dynamic) smooth quant data type. auto will set to fp32")
+        .insert("prec_kw", "auto", "topk-weight data type. auto will set to fp32")
+        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
+        .insert(
+            "gate_only", "1", "w0(gate/up) style, 0:gate+up will double interm size, 1:only gate")
+        .insert("api", "0", "benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm")
+        .insert("balance",
+                "0",
+                "if set to 1, will try balance the expert in topk-ids(convenient for testing)")
+        .insert("init",
+                "2",
+                "init method. 0:random stepped float(fast). 1: random uniform, 2:rand normalized"
+                "normalized(slow)")
+        .insert("seed", "11939", "seed used to do random")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// I:input-type, W:weight-type, O:output-type, ST:toke-scale-tpye, SW:weight-scale-type,
+// SQ:smooth-quant-type, KW:topk-weight-type
+template <typename I, typename W, typename O, typename ST, typename SW, typename SQ, typename KW>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t tokens            = arg_parser.get_int("t");
+    ck_tile::index_t experts           = arg_parser.get_int("e");
+    ck_tile::index_t topk              = arg_parser.get_int("k");
+    ck_tile::index_t hidden_size       = arg_parser.get_int("h");
+    ck_tile::index_t intermediate_size = arg_parser.get_int("i");
+    ck_tile::index_t stride            = arg_parser.get_int("stride");
+    ck_tile::index_t block_m           = arg_parser.get_int("bm");
+    if(stride < 0)
+        stride = hidden_size;
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_w  = arg_parser.get_str("prec_w");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_st = arg_parser.get_str("prec_st");
+    std::string prec_sw = arg_parser.get_str("prec_sw");
+    std::string prec_sq = arg_parser.get_str("prec_sq");
+    std::string prec_kw = arg_parser.get_str("prec_kw");
+    prec_st             = (prec_st == "auto") ? "fp32" : prec_st;
+    prec_sw             = (prec_sw == "auto") ? "fp32" : prec_sw;
+    prec_sq             = (prec_sq == "auto") ? "fp32" : prec_sq;
+    prec_kw             = (prec_kw == "auto") ? "fp32" : prec_kw;
+    int kname           = arg_parser.get_int("kname");
+    int do_validation   = arg_parser.get_int("v");
+    int warmup          = arg_parser.get_int("warmup");
+    int repeat          = arg_parser.get_int("repeat");
+    int fused_quant     = arg_parser.get_int("fquant");
+    int gate_only       = arg_parser.get_int("gate_only");
+    int api             = arg_parser.get_int("api");
+    int balance         = arg_parser.get_int("balance");
+    int tp              = arg_parser.get_int("tp");
+    int init            = arg_parser.get_int("init");
+    uint32_t seed       = arg_parser.get_uint32("seed");
+
+    // w0 (Gate+Up or Gate only, N size)
+    ck_tile::index_t shared_intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2) / tp;
+    // w1 (Down, N size)
+    ck_tile::index_t shared_intermediate_size_1 = intermediate_size / tp;
+
+    auto prec_str = [&]() {
+        auto base_str = prec_i;
+        if(prec_i != prec_w)
+            base_str += "x" + prec_w;
+        if(prec_i != prec_o)
+            base_str += "=" + prec_o;
+        if(fused_quant != 0)
+        {
+            base_str += std::string("(") + prec_st + "|" + prec_sw + "|" + prec_sq + ")";
+        }
+        return base_str;
+    }();
+    auto api_str = [&]() {
+        if(api == 0)
+            return std::string("fmoe");
+        else if(api == 1)
+            return std::string("moeg");
+        else if(api == 2)
+            return std::string("moes");
+        return std::string("");
+    }();
+
+    auto stride_str = [&]() {
+        if(stride == hidden_size)
+            return std::string("");
+        else
+            return std::string(", st:") + std::to_string(stride);
+    }();
+
+    std::cout << "[" << api_str << "|" << prec_str << "]"
+              << " t:" << tokens << ", e:" << experts << ", k:" << topk << stride_str
+              << ", hidden:" << hidden_size << ", interm:" << intermediate_size << ", tp:" << tp
+              << ", shrd_interm:" << shared_intermediate_size_0 << "|" << shared_intermediate_size_1
+              << ", go:" << gate_only << ", q:" << fused_quant << std::flush;
+
+    using TypeConfig           = FusedMoeGemmTypeConfig<I, W, O, ST, SW, SQ, KW>;
+    using ADataType            = typename TypeConfig::ADataType;
+    using GDataType            = typename TypeConfig::GDataType;
+    using DDataType            = typename TypeConfig::DDataType;
+    using AccDataType          = typename TypeConfig::AccDataType;
+    using ODataType            = typename TypeConfig::ODataType;
+    using AScaleDataType       = typename TypeConfig::AScaleDataType;
+    using GScaleDataType       = typename TypeConfig::GScaleDataType;
+    using DScaleDataType       = typename TypeConfig::DScaleDataType;
+    using YSmoothScaleDataType = typename TypeConfig::YSmoothScaleDataType;
+    using TopkWeightDataType   = typename TypeConfig::TopkWeightDataType;
+    using IndexDataType        = typename TypeConfig::IndexDataType;
+
+    // host verify
+    ck_tile::HostTensor<ADataType> a_host({tokens, hidden_size}, {stride, 1});
+    ck_tile::HostTensor<GDataType> g_host({experts, shared_intermediate_size_0, hidden_size});
+    ck_tile::HostTensor<DDataType> d_host({experts, hidden_size, shared_intermediate_size_1});
+    ck_tile::HostTensor<ODataType> o_host({tokens, hidden_size}, {stride, 1});
+    ck_tile::HostTensor<AScaleDataType> sa_host({tokens});
+    ck_tile::HostTensor<GScaleDataType> sg_host({shared_intermediate_size_0});
+    ck_tile::HostTensor<DScaleDataType> sd_host({shared_intermediate_size_1});
+    ck_tile::HostTensor<YSmoothScaleDataType> sy_host({shared_intermediate_size_1}); // smooth-quant
+    ck_tile::HostTensor<IndexDataType> topk_ids_host({tokens, topk});                // to be sort
+    ck_tile::HostTensor<TopkWeightDataType> topk_weight_host({tokens, topk});        // to be sort
+
+    int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
+    ck_tile::HostTensor<IndexDataType> sorted_token_ids_host({max_num_tokens_padded});
+    ck_tile::HostTensor<TopkWeightDataType> sorted_weight_host({max_num_tokens_padded});
+    ck_tile::HostTensor<IndexDataType> sorted_expert_ids_host(
+        {(max_num_tokens_padded + block_m - 1) / block_m});
+    ck_tile::HostTensor<IndexDataType> num_sorted_tiles_host({1});
+
+    if(init == 0)
+    {
+        ck_tile::FillStepRange<ADataType>{-.5f, .5f, 0.01f}(a_host);
+        ck_tile::FillStepRange<GDataType>{-.5f, .5f, 0.01f}(g_host);
+        ck_tile::FillStepRange<DDataType, false>{.5f, -.5f, -0.01f}(d_host);
+        ck_tile::FillStepRange<AScaleDataType>{0.f, 1.f, 0.01f}(sa_host);
+        ck_tile::FillStepRange<GScaleDataType>{0.f, 1.f, 0.01f}(sg_host);
+        ck_tile::FillStepRange<DScaleDataType>{0.f, 1.f, 0.01f}(sd_host);
+        ck_tile::FillStepRange<YSmoothScaleDataType>{0.f, 1.f, 0.01f}(sy_host);
+        ck_tile::FillStepRange<TopkWeightDataType>{-.5f, .5f, 0.01f}(topk_weight_host);
+    }
+    else if(init == 1)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f, seed, true}(a_host);
+        ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f, seed, true}(g_host);
+        ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f, seed, true}(d_host);
+        ck_tile::FillUniformDistribution<AScaleDataType>{-.5f, .5f, seed, true}(sa_host);
+        ck_tile::FillUniformDistribution<GScaleDataType>{-.5f, .5f, seed, true}(sg_host);
+        ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f, seed, true}(sd_host);
+        ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f, seed, true}(sy_host);
+        ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f, seed, true}(
+            topk_weight_host);
+    }
+    else if(init == 2)
+    {
+        ck_tile::FillNormalDistribution<ADataType>{0.f, 1.f, seed, true}(a_host);
+        ck_tile::FillNormalDistribution<GDataType>{0.f, 1.f, seed, true}(g_host);
+        ck_tile::FillNormalDistribution<DDataType>{0.f, 1.f, seed, true}(d_host);
+        ck_tile::FillNormalDistribution<AScaleDataType>{0.f, 1.f, seed, true}(sa_host);
+        ck_tile::FillNormalDistribution<GScaleDataType>{0.f, 1.f, seed, true}(sg_host);
+        ck_tile::FillNormalDistribution<DScaleDataType>{0.f, 1.f, seed, true}(sd_host);
+        ck_tile::FillNormalDistribution<YSmoothScaleDataType>{0.f, 1.f, seed, true}(sy_host);
+        ck_tile::FillNormalDistribution<TopkWeightDataType>{0.f, 1.f, seed, true}(topk_weight_host);
+    }
+
+    // permute weight
+    ck_tile::HostTensor<GDataType> g_perm_host = shuffle_moe_weight(g_host, prec_w, 1);
+    ck_tile::HostTensor<DDataType> d_perm_host = shuffle_moe_weight(d_host, prec_w, 1);
+
+    // do moe sorting
+    if(balance)
+    {
+        int e_cnt = 0;
+        for(int i = 0; i < static_cast<int>(topk_ids_host.mData.size()); i++)
+        {
+            topk_ids_host.mData[i] = e_cnt;
+            e_cnt++;
+            if(e_cnt >= experts)
+                e_cnt = 0;
+        }
+    }
+    else
+    {
+        topid_unique_gen<IndexDataType>(topk_ids_host.mData, tokens, topk, experts, 11913);
+    }
+
+// leave it here for future debug purpose
+#if 0
+    a_host.loadtxt("../../ater/input_torch.txt");
+
+    topk_ids_host.loadtxt("../../ater/topk_ids_torch.txt", "int");
+    // topk_ids_host.savetxt("topk_ids_2.txt");
+    topk_weight_host.loadtxt("../../ater/topk_weights_torch.txt", "float");
+    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
+
+    g_host.loadtxt("../../ater/w1_torch.txt", "float");
+    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
+    d_host.loadtxt("../../ater/w2_torch.txt", "float");
+    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
+
+    ck_tile::HostTensor<GDataType> g_perm_host = shuffle_moe_weight(g_host, prec_w, 1);
+    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
+    ck_tile::HostTensor<DDataType> d_perm_host = shuffle_moe_weight(d_host, prec_w, 1);
+    std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
+#endif
+
+#if 0
+    std::cout << "sorted_token_ids_host:" << sorted_token_ids_host << std::endl;
+    std::cout << "num_sorted_tiles_host:" << num_sorted_tiles_host << std::endl;
+    std::cout << "sorted_expert_ids_host:" << sorted_expert_ids_host << std::endl;
+    std::cout << "topk_weight_host:" << topk_weight_host << std::endl;
+    std::cout << "sorted_weight_host:" << sorted_weight_host << std::endl;
+#endif
+    auto cal_tflops = [&](auto ms) {
+        double flop_gemm_0 =
+            2 * static_cast<double>(tokens) * topk * shared_intermediate_size_0 * hidden_size;
+        double flop_gemm_1 =
+            2 * static_cast<double>(tokens) * topk * shared_intermediate_size_1 * hidden_size;
+        return (flop_gemm_0 + flop_gemm_1) / (static_cast<double>(ms) * 1e-3) / 1e12;
+    };
+
+    // TODO: this method we use expert-by-expert view, just for reference
+    auto cal_tbps = [&](auto ms) {
+        double token_bytes =
+            static_cast<double>(tokens) * topk / experts * hidden_size * sizeof(ADataType);
+        double w0_bytes = static_cast<double>(shared_intermediate_size_0) * experts * hidden_size *
+                          sizeof(GDataType);
+        double w1_bytes = static_cast<double>(shared_intermediate_size_1) * experts * hidden_size *
+                          sizeof(DDataType);
+        double o_bytes =
+            static_cast<double>(tokens) * topk / experts * hidden_size * sizeof(ODataType);
+        double topk_weights_bytes = static_cast<double>(tokens) * topk * sizeof(TopkWeightDataType);
+        // ignore index, they are too small
+
+        return (token_bytes + w0_bytes + w1_bytes + o_bytes + topk_weights_bytes) /
+               (static_cast<double>(ms) * 1e-3) / 1e12;
+    };
+
+    if(api == 0)
+    {
+        ck_tile::DeviceMem a_buf(a_host);
+        ck_tile::DeviceMem g_perm_buf(g_perm_host);
+        ck_tile::DeviceMem d_perm_buf(d_perm_host);
+        ck_tile::DeviceMem sa_buf(sa_host);
+        ck_tile::DeviceMem sg_buf(sg_host);
+        ck_tile::DeviceMem sd_buf(sd_host);
+        ck_tile::DeviceMem sy_buf(sy_host);
+        ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
+
+        ck_tile::DeviceMem topk_ids_buf(topk_ids_host);
+        ck_tile::DeviceMem topk_weight_buf(topk_weight_host);
+
+        ck_tile::DeviceMem sorted_token_ids_buf(
+            sorted_token_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_expert_ids_buf(
+            sorted_expert_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem num_sorted_tiles_buf(
+            num_sorted_tiles_host.get_element_space_size_in_bytes());
+
+        fused_moe_traits traits{prec_i,
+                                prec_w,
+                                prec_o,
+                                prec_st,
+                                prec_sw,
+                                prec_sq,
+                                prec_kw,
+                                block_m,
+                                gate_only,
+                                fused_quant};
+
+        fused_moe_args args{a_buf.GetDeviceBuffer(),
+                            fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr,
+                            g_perm_buf.GetDeviceBuffer(),
+                            d_perm_buf.GetDeviceBuffer(),
+                            fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr,
+                            fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr,
+                            fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr,
+                            o_buf.GetDeviceBuffer(),
+                            topk_ids_buf.GetDeviceBuffer(),
+                            topk_weight_buf.GetDeviceBuffer(),
+                            sorted_token_ids_buf.GetDeviceBuffer(),
+                            sorted_weight_buf.GetDeviceBuffer(),
+                            sorted_expert_ids_buf.GetDeviceBuffer(),
+                            num_sorted_tiles_buf.GetDeviceBuffer(),
+                            block_m,
+                            hidden_size,
+                            shared_intermediate_size_0,
+                            tokens,
+                            experts,
+                            topk,
+                            stride};
+        float ave_time = fused_moe(
+            traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+        if(ave_time < 0)
+        {
+            std::cout << " not supported!" << std::endl << std::flush;
+            return false;
+        }
+
+        // float gb_per_sec = num_byte / 1.E6 / ave_time;
+        std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, "
+                  << cal_tbps(ave_time) << " TB/s" << std::flush;
+        bool pass = true;
+
+        if(do_validation)
+        {
+            ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
+                topk_ids_host,
+                topk_weight_host,
+                sorted_token_ids_host,
+                sorted_weight_host,
+                sorted_expert_ids_host,
+                num_sorted_tiles_host.mData[0],
+                experts,
+                block_m);
+
+            ck_tile::reference_fused_moe<AccDataType, ck_tile::element_wise::Gelu>(
+                a_host,
+                g_host,
+                d_host,
+                sa_host,
+                sg_host,
+                sd_host,
+                sy_host,
+                o_host,
+                sorted_token_ids_host,
+                sorted_weight_host,
+                sorted_expert_ids_host,
+                num_sorted_tiles_host,
+                topk_ids_host,
+                block_m,
+                tokens,
+                experts,
+                hidden_size,
+                shared_intermediate_size_0,
+                topk,
+                gate_only);
+
+            auto o_dev = o_buf.ToHost<ODataType>();
+            // o_dev.savetxt("gpu-out.txt", "float");
+            auto [rtol, atol] = get_elimit<ADataType>();
+            pass &= ck_tile::check_err(
+                o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol);
+            std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+        }
+        std::cout << std::flush << std::endl;
+        return pass;
+    }
+    else if(api == 1)
+    {
+        ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
+            topk_ids_host,
+            topk_weight_host,
+            sorted_token_ids_host,
+            sorted_weight_host,
+            sorted_expert_ids_host,
+            num_sorted_tiles_host.mData[0],
+            experts,
+            block_m);
+
+        // done, preparing GPU buffer
+        ck_tile::DeviceMem a_buf(a_host);
+        ck_tile::DeviceMem g_perm_buf(g_perm_host);
+        ck_tile::DeviceMem d_perm_buf(d_perm_host);
+        ck_tile::DeviceMem sa_buf(sa_host);
+        ck_tile::DeviceMem sg_buf(sg_host);
+        ck_tile::DeviceMem sd_buf(sd_host);
+        ck_tile::DeviceMem sy_buf(sy_host);
+        ck_tile::DeviceMem o_buf(o_host);
+
+        // manually clear output buffer for atomic
+        o_buf.SetZero();
+        //
+
+        ck_tile::DeviceMem sorted_token_ids_buf(sorted_token_ids_host);
+        ck_tile::DeviceMem sorted_weight_buf(sorted_weight_host);
+        ck_tile::DeviceMem sorted_expert_ids_buf(sorted_expert_ids_host);
+        ck_tile::DeviceMem num_sorted_tiles_buf(num_sorted_tiles_host);
+
+        fused_moegemm_traits traits{prec_i,
+                                    prec_w,
+                                    prec_o,
+                                    prec_st,
+                                    prec_sw,
+                                    prec_sq,
+                                    prec_kw,
+                                    block_m,
+                                    gate_only,
+                                    fused_quant};
+
+        fused_moegemm_args args{a_buf.GetDeviceBuffer(),
+                                fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr,
+                                g_perm_buf.GetDeviceBuffer(),
+                                d_perm_buf.GetDeviceBuffer(),
+                                fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr,
+                                fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr,
+                                fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr,
+                                o_buf.GetDeviceBuffer(),
+                                sorted_token_ids_buf.GetDeviceBuffer(),
+                                sorted_weight_buf.GetDeviceBuffer(),
+                                sorted_expert_ids_buf.GetDeviceBuffer(),
+                                num_sorted_tiles_buf.GetDeviceBuffer(),
+                                hidden_size,
+                                shared_intermediate_size_0,
+                                tokens,
+                                experts,
+                                topk,
+                                stride};
+
+        float ave_time = fused_moegemm(
+            traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+        if(ave_time < 0)
+        {
+            std::cout << " not supported!" << std::endl << std::flush;
+            return false;
+        }
+
+        // float gb_per_sec = num_byte / 1.E6 / ave_time;
+        std::cout << ", " << ave_time * 1.E3 << " us, " << cal_tflops(ave_time) << " tflops, "
+                  << cal_tbps(ave_time) << " TB/s" << std::flush;
+        bool pass = true;
+
+        if(do_validation)
+        {
+            ck_tile::reference_fused_moe<AccDataType, ck_tile::element_wise::Gelu>(
+                a_host,
+                g_host,
+                d_host,
+                sa_host,
+                sg_host,
+                sd_host,
+                sy_host,
+                o_host,
+                sorted_token_ids_host,
+                sorted_weight_host,
+                sorted_expert_ids_host,
+                num_sorted_tiles_host,
+                topk_ids_host,
+                block_m,
+                tokens,
+                experts,
+                hidden_size,
+                shared_intermediate_size_0,
+                topk,
+                gate_only);
+
+            auto o_dev = o_buf.ToHost<ODataType>();
+            // o_dev.savetxt("gpu-out.txt", "float");
+            auto [rtol, atol] = get_elimit<ADataType>();
+            pass &= ck_tile::check_err(
+                o_dev, o_host, std::string("OUT Error: Incorrect results!"), rtol, atol);
+            std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+        }
+        std::cout << std::flush << std::endl;
+
+        return pass;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_w  = arg_parser.get_str("prec_w");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_st = arg_parser.get_str("prec_st");
+    std::string prec_sw = arg_parser.get_str("prec_sw");
+    std::string prec_sq = arg_parser.get_str("prec_sq");
+    std::string prec_kw = arg_parser.get_str("prec_kw");
+    prec_st             = (prec_st == "auto") ? "fp32" : prec_st;
+    prec_sw             = (prec_sw == "auto") ? "fp32" : prec_sw;
+    prec_sq             = (prec_sq == "auto") ? "fp32" : prec_sq;
+    prec_kw             = (prec_kw == "auto") ? "fp32" : prec_kw;
+
+    // no dynamic quant case
+    if(prec_i == "bf16" && prec_w == "bf16" && prec_o == "bf16" && prec_kw == "fp32")
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t, float, float, float, float>(
+                   arg_parser)
+                   ? 0
+                   : -2;
+    }
+    else if(prec_i == "fp16" && prec_w == "fp16" && prec_o == "fp16" && prec_kw == "fp32")
+    {
+        return run<ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, float, float, float, float>(
+                   arg_parser)
+                   ? 0
+                   : -2;
+    }
+
+    return -3;
+}
diff --git a/example/ck_tile/15_fused_moe/misc/moe-0.png b/example/ck_tile/15_fused_moe/misc/moe-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..aed1964f2802c4e7f65d7080f338309c8c2287a6
GIT binary patch
literal 76830
zcmdSA^;=t8)HPZ>c+lbyytuZwJ4K7TLveS4yB065#ibN0?ouoicXxM(FXx>1yZ8PH
z@BM-2A$fMP*Pd&vF~=NpCQ?~Z8XbiQ<=wk?=&~{rs_)*xmO=mA0kF__u9#x1-@SW#
zla&xv^ISa6M#{pKTpVci+1!1{gbGLu-liw$0?<kb#^bO@n{)+Sv~e2oy_|mzNXN-J
z7)8dTiJ;@%%i4m?>@7Nd&B<A`%lN*aKfkh|y}b8!)x40CbBf~7KA}6KK2ukHgbfM-
ziXEbJG~@!s0A-rWPspH<ghzz`UK;*yeJT4w$;!$~LqjuL7QtR#j*o{&Lq|8r2VMF+
z5bq?H!#s+Fdb6py@W<CV10TD}E3mEYQls7O==Xzz12`m%&+_v8e+-$?3+VrREy<p3
zv|EZ&T5WPzyIb>j__e_2b~xSX|8}W3GCaJ{=5;lm&A-v1l8B6q{QdiPyT$tM&Yc&t
zkyPfZ-Lc?7)*4ku(xxRrv!R6aeG_BjjX6XA*!pbm!!l;w#+bkQzCvqHXPs{cCAo0~
z?3)9`4wWyqvmDTs|FWD|?eOE5YjD{c-*uXw2tmf#h2F{iSZmsIsK!XMc7AzT4|i_8
zJ6N~g3ajFp3VrH_LT~4L15e{$7GowUF^9D_bJc79Z=@%uiSjJ6pFhXAIa{Y6A0LOJ
zzVERC1q%lE@kwqIL`2=D(XnY5u_*UxS-uf?p0(dqcfLMK#S(Rs-|SCrS%HJ<+V^sN
z47?9$e1-0oT`qc~Cdhf6x24JbLNI}J7a;hWxu?_Cv)8-Mx0PDs-qV%Vv8@Qn_2miv
zv$jn8mJd`^RL|iK?Kk)x<y_s2n$^dDi{($!n8gUO2>i~wku>$))RRwxma|>Pd9}5*
z=~<eq7AGv2;qb`$T=y?`M#%lHCkM&>mj#4O2V#Cr2oaO#l|^t@2|qw%M&@(;69E-J
zLfH5tJ6a+z==x}m6$tQs*hxz>@cvqvs6yItzZpHk+DStG(QYwhUto5)OcppBL&&+C
z>Cl1Y-_zc9F+luN5tkRSp!8**vdMl$2SqOLM)w(c!?4b5_@elQ-2eF^MbluRTA$yu
zpz!^W|Laxm+v8kk-`;Q%-J17N)lN3A%kJo}aXxw9xcLP4<9Vks?z!jNC5KI=h++b;
z$R~`Peu4YLiW;#EG4NCF+w)$_QMLHbE%<3vp^&TqDs<wHzE@+P3g=F$1MHR>yEzw6
z+>5Si{9kO`SVHY7#Y)vHGz4F+r$ELM1_rAu9&51#7;pbPRzvQXj>SZdpr)P^!ejP?
zz}4v5+pEVFPuoW(!a+l?os=c})-i!Nt&O@77AQtkwgzI2O97POXoQp*=L50CSpA#f
zICP2vC-qa-bT6e6^$}3$Q;3MBV;^l+qOAGdt*pJ?ehN8`us8i%S1?$rQn(6!vTIt3
z-3P{cbOpfRp{XbG$>27SpTdpPY!bY)_vSK%bBj8(>KwBOZf>-mLFtU;u~1f4X0w7&
zxVrY`ZY*|!Ummfjq#_29=;xv6_~rL>yW6zi|GM_}vUW8g%qsDcW(Y?9=5KX4qiIM=
zPL645CPm;DPBab1?U$v?uJ?a?z1SLrg8F5HE-ylv^yj#fopR1?nkEMHxP)PC_D+|5
znGk;SJ<^1pzt;VNX!o;K^r!CdMOop7&T&}z=IumzAnmH#oW6<P7e1kOZ$H|o#dsEk
zdf{2ZO7`-?7g9b~5nhM+T9}^Ojc{^4lDLE^GAIC?wg${CEDH8-PgjTwUdZB+m{0a{
z{Rb?qp5ZVBfENaaem65?pVoZO*So9rTdTEx_B^IQUYnbn$-(y<qnr~lZrWk0!HYFt
zb)Yl@1=%AM4y(>1%s<DqnaZ@Ddb__vL!aD(cvxhFfKDbr(Ehdtep1N#f~x#<#MJ73
z!gtnorBP!b2xS+S3BgaZ!yt9g#}{x?Vj}jn21qka-(9;|NxRLHUBDB{0$BPVtsq^G
zco|dw0DU4iHz{e$`8T9k2#O-cgzfK~mMnVBz*XRj1y|4O%hTP_Tuob2bab@v)A292
zqggyWJZ^_o-=PH1N{d_Xw~gL#O@$|?R%tRXiTj#d@Z$%wAUr;G@o)@Dmka@~%OcsN
z*-CAkf)~eT%Pc2Ep{{t}%V9da<x}YSgg$-v#RfDoB;3}?Udz*99;%DVra#YT{%^Xq
zMlgMNxhh0Z43M%JiFnbS&HsekEA$5f;Q#*pt872ybANq#S?|B|C&i)j#l1HSy#~rl
z2@3|z6pF7~@xs7AF$C<e7L8`Z*tm+KqA&%F&!{ekE3Hivdp2twFVH;>&wPA&eS`86
z{pc4ZLGDepFYuq)7}Ve^Q8mJIw>CEdb?og=k;r{(eIJ|w44W~}&(DV72lMeP-sTPh
zI_d9fIx+Xd(1(bkKM5nYmywf0=@h~a_SHuEdX591SK9H7N_taix(B+f7Kb$<GxP(r
zn%T>GVOCa5$tKqWRXxM!L+Ei0#X4R7jk7`ZN+Rg(DkcUSLyj09^zwW)@of<LT=;L4
zh?1$3BHlNL=GiC#7CbhyOnNxv=SxY7e8?IwS3I<>Vy@9`B}GM9(6CxzMWiV4pVVL0
zlgFX%75-I$1>CoIoW*@qy&wgtu}NO}*2^XPd(mxm2*2J_F#!hxu<?KBUQio)hIw7>
zf>f21l)||-ADep&UoX?3UxF9NRH@&33&|H}184dF`B0MO*n^ysidQZbLrXb3_yFbO
zHQy`rXFU7qLg{#U)^@HwT^-SPNJ#-V1@9*7l;g3GoRvwx{P-N06hhqny;l8x0?Oh@
zjZsqY@&R<i<KrFcL8yElr!m*-hDTRa$!vNopIzTxUxtV-<7ROuRU+5~xw&s%pU$Lp
zduxpLc*yN$4_pfZd*SlT*0GJFT!OlkH~v7lfbQv!uuaJPqFb@?@zP1*2dSTaV><^&
zcWI~`HzS;eiir$|XCcmAtooeg>*v8c!WN-a6W>H*mKt)A&?|a{q7mbOF_{r=mw%c{
zXBn>Q+2YpFGd{~D;~l?Q`O0ud{=6-`R#7q;;%nnaO@p6^u$DrtA5RVx?Rq&`Z6}5`
zB9p-C70U5B{hbh<hu#Y}o+G&A&C!EU57fd#quKVYFM#I13Ev<L$R)Y&xhOo$oSuuq
zOns;2L{2T-JkS|lMU;_yMUd&ZyrNpn2*yr|V7f$!eBV6ccB=yj=POeAMRbS~!45+z
zW*3*Ep;oG%TmfT@m>Gv<Kw#g!m9^vEM2+iNX6PYa@OHj05(k7h8%*Ctm+Oly8gpah
zwTQ_Y%;{u)ubdonLw8<VDq1g<g((?Aj+?~Zn}kl9A>_wLL<WZaC3@4nkWV639aiag
z(f{7jdRMdviJhE@a~lTb4~%x$7m;Ymn-wT0QDS3v6(n>+pSEPPM<OCj1U)Ro+WK!%
z<PGCJs~&mB!38uMUr$O`?L741D4`K?0YKxV9?LN711tTSj>hNlLm4v%X(caKJkTR6
zZWN0eNBl_<MNvYM04ql(E8v!K92sUfU_N@HlyshrVe#<uy4N>pt}M9|l&XJZM3`o1
zAs1$W^9GQm^VzxRquvw9sA&z%|JSyiN`OO*4_7ECK0!C{;cbijP2}n)A56WC1OaQ?
zx!C>n{QDrSk|{aSsrI{7A1s1UOBlqk(alE-ZwL#-G!Q8~3}m>mi1|U%8uUCH983IZ
zpoYwQKR-$y%_Lr#;4vyuQyot>NFigb;6wf<WK|p-I*&^mzKYOR?FGC+l^y6#(0&{e
z;)pgGL{gdNiJ-EQ#v(7?*o)}xvm9c}U>5r{B~+X@BDoHb#==hPiUR(o<i9_^go=;X
z$0HfK0v9g3bO=<%DN{IV*FLbD4&;Sqa7T#1jKPZ{$E}k{FesZ})d9Gbd18sUQDo_S
z5u5`f;EOHELQF%H?)MAhA=_iY`@-uKOh|+LuKNKp?-f>`@jr%rj7Y^Zz=h;l=K6gs
z;?0XK)VWAXNb27v?xURL)nX&VrFeAMAdM%Yh9<i5pre}m&$vKX_0pAS){cl<^2;h^
zrfG5$%3nJobU@Pr-h6i~gRWhmkT|unD!tG>QF5yn!ZbU{UAcM%aJ@IaE9|@%n$lqQ
z&Q@CA8!BY;&FFNnU5v1_qBX>oi)}|3Zg3uwq2qPnot9ZI#w#KoBci~VN}8;PK>+>8
zP?;;#2KR`=1G9lzjb;Ow1p7w60yp)RTmEu;)@qv7LWRzLHAt@^mD&uMxje|R0ah7<
zo<$9Q87BlD3z6t!)J|DR4)%sHQAxua3S+o<H`4Wh(V&%@)%PJLYH1W7XO*1WLgJ`H
zk(n*=Apzt*aw|@HAiPvWDXwH78O5xWs3?aqb~DDK0T22rir~j>(%i`a;pcO4Uj(>X
zB2f7|RTP9!tscA{YM`V-jsW3hB#v^<P@b0%H4LxVyC{PXr+^)iHH=w6wD(@Nhw_M3
z3;p+~Sa@s%ed&OBEgVVf7nD!^H%{hSb?W@aGHi4N!dWY~CrcY?#uET<3W4)THn1Lg
zC&!DX*J8Zu<|9uWD#~|JfpCTZ^PZVarM^5i&gry$G!Sd8tm}9lVFC=iv6TcH8#|rR
z&EX7B&^j_1Nq@95+2lt74qx_;s_S7?K4Yt%W+9<Mf=Lla!(?V_63W1l0_1QjVH1SX
zp;TsF;P(};z3lUmd*t!%b_!xPf_o3OVy{<d)<J{=rUvxo6~;VDqU58WgRL+eVP}OX
z|DfDPVh&+tA+jK3iwKJ{`3GFak_)e9AS7c}V={{E;UUE7h}0Q>Pe6AiFp`lWd&Jf_
zk@g^;jSdSYrz8v24D>9-{>ETTY;s*4Bqm<1f=bVqj2QBTNmT6f`K+!3Xk5f4Kv-gM
z_!-$5#R1tov^x)C#Cy)bs8QMVV3kNuxV*@B88r}18w04Rtp(CigqCVw8|XCI#{!XP
zLuLcZyRqtFub))POZlIHq{Q@}>cZ5zM<F|*B<m<9To_dZ#dhfK#y=gItRy0ml&$XZ
zg53Jd&I7HaYuay#5@WBhO1mXCK09*6;0)u>1WIheC59jZ-iQAYYls>m`zj978bPOt
zq0TR+Fr+702AD^LbhCB~h>c7CSf_X7E`4_y?JSZ4vZ41)#)BoL#{3JcgO!BezkjzD
zeNC}Yio1;}6%-6&CQ6f(TW?los?}9PISuOTb?dcVAHL|_ANGeT<sSh6R1E@W)MO4<
zHLgYmE8!?i4Ad|HVK5wrl9Y>Nq1|Kwf<VK8`($u9RqqJ|qbf9FGc3Q*w~Dg|vL}wv
zmDB?<&YRu22be&N&SCCCVbn#R`dbhuKS@4xUG>CJs94Ec(fy-%R5TnjYSV!y;Yja$
zS0YX~Rv5Asg5^Qd0D0rI0`9295UB!qWjV+F*|=8RYt(bVIqim@2aXy{RtD~Mkp{W3
zP<*Y>L$FQosa&@g+H4ME^TTfDWEaHyLL4`mM~iBaHLmxmFwRkwhZUecq(o`mSxfIL
zk`(-$KBz+4jL;*h1&cNyU~k<ygn>4yhA~ympf9Ivegc*KD3o|MfqC%jzbRB$+pn=7
zMRwidRG$!b^@MpOu0twigIq-qF3zC3Rctq!qGZ3i+uj&7I~)vfbTU*sf)<YAWY$`u
zh?9sLby)6Nt<%j|@FFqrS`FllQHSEhA%s-jv-|}+JvxvmsF>yaS*M61q%OhM1QmF=
zM4!hG*(pLOSpwZa=|H4OIPu<ON(IfH;rmEjUm42`mcMZ1=pKRV*vPfxZqhcP9HL(#
zrx4_if&|Cb^R>p%GCb5T>V6*nClwQlu;|?HFC~gQctqPsYnyB5u06#RxZ(<+5_+#+
z6#@04+#C2nf_$T~D=er&yjQI^@l<pnQM2mAB~c3HqU@f9?=mfv0y6b@E6`CPWD>lF
zFp6j)qW`kTfO(LH1!jnkV0}s-G~PSM1M19oH9Kvsw^n95<iDswE_!HCo|1z&)h`&?
zi2@0*b}SSSpXq{ve<);%IbGCzMK1j?kGJW$J)J?yJ_M65dZx%^`;BQaeu9mT%Y~DF
z|5k)b84$6wUN?Hw#b)$w!#D)jR}VEecsIa=-3A{qpg`nD0+vnnT#X?nl|-qX-8i=*
z6t?)}LIo@cRa3SC>JTt!kDdq+<L2mave*zIGBq_dDb)W(^}1{Q&*VGDU8o-+&ced?
z<9vGmI>G>U6!sSW0-*gw-F&-fGnrEM<8}+68pMzCgcFBpfQBCiz;fbRw2Qdy#Q^5^
zRBfs5n`wGW4}qLdoPX!NIEGrtMsb7B5AGR?{yLF(gyV(PJ2J{J>tLXupszlv;Wb3&
z;P8<HY(~TP&r#3AULJ2UxotEeqVjs9h{93Xkg|%2X24pwa>VWQ1hPY2J`yM4YI#>7
zireQKy^=K}D1R~4MaWdrsib0nND3$M_c&U#*Kqg%2VGHab?GOkorHcOHXYZ@JjdDm
zd(#;(r%5nRs6^mCMF9yUPb^HNOI=exlrq$LG>do2o8%yTFl0}%qAahSqFR#A_-E<2
z1j){Q>u6p^C`oP<Mn`gfTRd;5G}$1uMT}J=VW(r;Y;GWD`hzqq)oek^gzID56|!E4
z#iBLw31Omqq1JLEgUoXshF!-<5?+MOcDeuBC_&2gZW7J77!>>$h^Gc4$6+QP2F!ch
zR}v?Nz(Y>OTEdh`;eq$HR+E(1VO0+(bJWAw9aJLhnT-%|GeYwhEtMf9$Zu&-?+zGR
zqRsAkK-ko17Lyy=(<AouKqJ{kbdHhG;&zrmh-zjU93ls@0e}b#0t>4Zr0D`$15S#6
z{OEBWl|>vPMME?v6T%NYUgs);3H(Ngkg@1s_l4FmC8AS&z~mcQ5ejYWEi6rx){sSg
z$UWL{*q#J{SO}l7=l%7M1cW0B4Rq|xcE7wG6Bcp2dUA8RawH*uxt*UkI&z>?Pf}F+
zyKbg!xnr6D`o2dI1pgkJ-f3V^x2psNtrSL49$Cnpn$ehvYqhoM>yO*QZjlic2`lP|
zkB+QGU$!X=ZhOtX32avc5x#0byh4XtR_r~u9Zzc3!$D&_@b`XLSX7Uq=`p+3-X!<R
z^*CxcM47nB_!69@s9bLJdP8mrh&;4?2m2q!bA=5hTG^pt2-_T%X)c%THXudA)5llC
zN6{eRY9QZXgIYZaxqt`J<={W){^z!;<|pBoAXQ{?WL);2ZbRkn^=rq1TL$!wE^*il
zL?=WyLCf7lRQRwg(N`c(EOPX`vuG<aJnI=S)?$HVo;*CBD8w`;UP{oo2i_B#noJvr
zBnfxC{QC-O7?Rfgq-A)|yV&GFjy8HsY5}z(2*d7%5cWw6{#MPstp{To1rAXpQ}_<>
ztHw<iIN15Evf|%RNcAzFiysABHj`fSomX&92;kg)$+?JNe}kHU2_M>FCFraI5D*Y%
z4X8#D-J|~!2zqmMa43aUclG9d0Ch=XeV^=vfJ6G}&(F1^cD{C|+t~hEsJvpR?8MHw
zeYsnc&W{W>85L${SGmR)15?(M(}sKwoiF<8S=yI?TkwRTF6A2fZhSco_HUdHJqt@1
zdx)#tMhC<Z!cysOWi9+iQ~>oCB_isEH3hi|3y~Q6G&dKSJ_WoPXs>r@a!Yc!3m{qf
zE*Pu0-TTJ+Ctz8C5Dcdo@XhsP5pPcT2k$RwB`7n6@;Gfu@VHhu$#;Lq+r5{<t2bmI
zw!qE3V9vs_CC&dbMF0KXt6x?!Nkkqe7`Vwxj*CHy4W_Lm`Btg^WV{ARhdL-<JeAY$
zV@`v<>O!l7_2@|uFn5Dxqpp81V$tE$%Olx_Hk4D$ftb=#C>mQ5CP{1)v;^X&WI(vp
zbb=suhn<P1>7$Iti4sI7AY?OGmALKWS)<SaG=qG!rQa)r9LsiB`G)chC_inG0>gR{
zdJWUzerxmz@I-K#_~#q{xD|$as>;g=SL9yX-%4`4QOtu@e*?lO|AEtbehrXJJ%V`~
zvKN5#{)*B@`U!>smX!8_a3KCtqGNq9UkPuieBPLiiV7|gbBzOVO15TUZm!)xZf@`u
z9?g_Z@WUaIXlDAbdUqib3>V%Q`CkI3ZOpEdt_O<qAoK{9RsxnV@h<(ItLS7*a-n6u
z333)#mF9Fb6`0vF7p$pBAPF<pB6-IJr5qY1{FCTTRBS(myiBHBl%WKKOF3op7X(}n
z*k<(Qyo^|k=jgl|=ciN3LECyi<t-+XN83_cE9H`!1p&`<>X_V{`Q*O<hSt@^c8A4F
zBuPPpr2OvnU~bEC5-kiJ#y^Gv-9KSDrJCWAq-Y9a^WI|Rv^#ux+sNTZDm}hDef#jm
z{&SyV*?xbwYyXOq0xo|HzGx(YM;U>O=MPxOJQk(I*P0oEnzNdWTlwx);UWM+tWbiN
z<7hg2go}cF=!fX+mA-U!8O3ih1UAw5%;z{fOundtDAC6&t#<{I(LG5-9UbH;=*A+O
z;^X!9vDhk4#_Co}#GmYKA$c%n6TUhmDo<F(6p13gM2W@QVvHI$DchlS9TsN~)Db%<
zF-ev!|9$gfWdWpCL%Os_1fIh>;_S93e31dvlaJAE3W@D%*{@83Wk_M&>$lIRqe|LJ
zp;qUEq}79pwQxvPP~>;+>KEacYn)1qpG++DQXJL_PsK(`7zn7Jto1J)-(K$GN%=pT
z;RUBa?Ykc7EqERt7uQAV%|~$l%h#`8&krD%3H11}MAC^#Nn_vaWX_=VAYw_(w)&#G
zBdaGSm(o4{FH>ttv{&uH^Zp^uQ8;|_?9WCznnRD^FHiYq01wA9$Ig&})hgYlgh&pJ
z&nMw&fU~<TT`u(W3f_`=UnxkFgW)q<+n2cA;0UBy^SBC4(I+SEd&hfUE8bJG9NP^C
z<J8#&wuZ77HpM>c-3Sbph#yEbfK2UXM`N!=@}Fz0$*oWg1p#ASbyFKzSY<kznBdv>
zr$1E|*7Zo6g$i_AC~NHIsf}T8af8)o!5`+Mlp#<TS3Cw;n)YMA_$ORSW)%7BfMOY+
zl#t*<=`W(4<TJ=6IAF2nTk)UDFEW`A0*DO=UIEt`6An;8itBp_^NHbcY_~gN5mQ+L
zAitpvQDkU<8$xD@n}-EOEa+G}`gP)>c`IMq%vOFE^l41D4?ag&{6LK~1A`oBD%OkY
zuO6c|>y;Q}gCwzxBYyHeK7ryd{Zhn50}#^;{q(ayZ$;)}{9iIYSNg}7<~+#c;xt&L
zEdhR>chuqW`oP4UI*Q0xbSMiGhaCoeA^bKx{!VHdd|`x99+^xs<^y1b`Wv2>)MPhg
zPLJR4uG8~f*yQUwBF;L2uS8)j;1}-;0Gb6Nae$K;Rj<b}<{bEfh|_p)7P(|lqJJ?S
zyH^2A^ZN^U8}%{7SG0UgI|1(t_ldMH$_scJqmuDCW3%6sEPik?shayFJSmWRi?J^Q
zu#bh9KZ4|eQXvxA{S2*5ElJOKHBQ1cCB9c0v@h~qmBA`|m^j;18Wa<oo=l97hslXP
z8Kam`w&Q`Cpcw#<3oeVPe5_KKT~Gl<G40oB0AqlRi1{R*h^8zLf(yNn$@e^BbR{B=
zT}+q254u<MWR4mFyM|=Yj}mAm$B`sQ1BjlXEh5N}L-PirYt7gecf|{Be}6v^O^@)T
zPwy{siU00*OEHQ`qn^261uuoH5j7Cle3}AAjj~(+mzy~d+~q!TK8<+F$kgk3_ytzO
zS{5oIGp2{25wsrx?YD{Ne`j2`QGS{n3px({d>^C&S{Ld0GX=g+bL1u!1`8-kX>SCX
zjAU3@`QN*DVg>Sab5jME8sav)-Gt!5pE`Db%tSZ#z6Zw*(MAcH#pZfmBc`Fte@v^*
z=*rD|;32`dx**qsQ4PwFjMaSP5UIDCM#zdR%K%YGm-3jQR2h(nVv@xPLD{&U1;0m2
z)Cpv`Df{c7J47i!S_j&Jxi<w&6Uku?_DW+__DARTPOtq_p9?ZWG7!;;;7cB%ep(k%
zzbJb=$fN_3+-SQHGz+6-HLhF{bSrNjuYi@bv$GRQx1?8RmK~vNv&dQ?#YLjWuzm2^
z*ccQ?;WaY}CLr4^f4JI{&blFtd&ns{S!xRJWWIv>yH+Lp>KT`6TYXwjS65f9ZbxCG
zy5v5YUEw|MRg0LMl1}3KJw9%#=Ldk}RWt@;97ip{AhbtxH*Xprn#CmMsYZM#(zY?3
zB}w2k<;*^>_3|5x2kpG{(mP3Wx$V<<=g!{!)CsK8bDL35AmOneN@;kn>UZ*f!nLOL
zl#ig@jW<fIQ5AtGvbr-pnD&}u6GCoHQiY|@=lR)_MTJ)E>0$|(4V*HDI{)j<;<RqP
zBjT8jX^v1uC7RP;Cwl}fF@=}+Bk6J?C}@3G8gXNGRP;)*Dv)O)ZlgtTG-9X3+61^T
zKj6k|_B22h306N|hZ8ZME~T$1a|DVQA1DcOpP)&sCi;}dY6w0kP7`|)PhHG(Nul~k
z2tY3HEf)dh@%9#N?5<1oU-B}_fEZTxuCyJj$KOZq!$_1BIQBh$Dzmkx=s^vM2Yu*E
z*(fv$86r}|LBeVr&)`N)!5BmRUJWxP`8iCr4lyVqN<xi@UC_jXV|k^e!*FyUB?ud@
zV<tO2f!xXdcO>r|YZ0P&wS!;HU;;=STDc)oiSmXJU^1aAU~`-M8Jf)08`^>y5by+_
zMO(Et&}hVoM!mi@6uv|pD)aAWg|Nze3*wi`g9}Kn;ag-)vFi!z3$b0Gj#0RF89fN8
zB}x8RP#uxfEcvkYNdK0wJcG+V3x0Pc_8|{0#G4})+ny+tK5Rwo@r5n<V5%g%W)j!3
zKwV4lht^Q|)_E-`2z{%oyE_<3+(@E|LRHFKq(}*xmKJ_l>M9M8g2ZaW4r?wd0xqFC
z%LhV_@aZJT`v%%{Ei#%6517NzLDgiuO=DxfyX{8C@@Yy5QYKy5KhMT2T2Ey%IVQE!
zEkb$4Wk5|4`wZk*`Ac#xjmlL55Ge*eSZCn>jo0o3x7R);C=+!9`#!hr@<=>QZisM(
zNyd^162<j;A3=%yX|NxQzUwLPuXsx$v>lAa|1Cc<gKCt!;KgE^{L&TZ4m$^Qq650b
zl8?e^|H{DO;I^Kozh<rCz;qGo3o{FWpM_SiGIP)av2h_@7xP!#r;oWup4jgLj}5$f
z>6eh0ofNnNyNl_qDJ5ZUO$T|=aEuYGF9Vf}m{i?+&`gi~0-8?awiR0%oKa?-)imnO
z?+sDLwVJj%(BPFu?!rD#;r>zp7f|Q75&B*%RU;GAswyMue5WW$5xZtTG)DUdxg?*V
zzX0v%@fF(l%JzSKfTp-!N#)&HZ;9%EoPIs#f>K=<hS|GM4yz<_V$&H9c&`$_^F{OV
z((wW4IG8p)hNl#_<4KfN7yQt=k^o6Kl0gKq^YM`J2p*MyE*Y&D#v{-m<hOFMdp)^D
zFasFs^{Z${R8`Rx_UDg;L{mtJtEfposcg~vE;F_z^fxlf%F42_xeTOe;Y`<m{SuN0
z!kOt;HRj^e6P8<g>N9xiuLE+uOGniq9YxrU2I7qoe+;((7QF2K1-O!^FtTZcsPTK8
z`U(PCh$4_j2^rZMl9Yr9-%HSR$M!Tr?k6Jy6(WXdK{bf)xv3Fu&@d{re_7l!Tcc`*
zIZ~d-r`%SQqs-uvYU2)RTV^8|BFc;1bhlS)|3YL|uoXq|3B?Z+zzplkYfgB#Zx8Kt
zSNYhX^cC_D*n2tCq|(ZDzaT41wENsi?O`*(&R{u(&7iJZ??dut6V!0|`v{r_6eim?
zRP(!mwp?UKE$Ykf`=kGeIf;6<xE;~PiAzi#t5jLh(Yq)2?v7<nI%qXm<9&t|TnC;=
z2U19hYHv^~lTK5_<>{-}){x6#5{4sh)I+BpaG41Yq(1>ZNR;*b79R=SN3oW8QW<g7
zI%hqfwECdTk&LMtv@J@U7s?g}RmCxR0~=nG-9f8);&8#?4QA%%-8>s}qbh`Yz(8Cz
zpr1{Md9%gQY?U!56qy0Lk+hM%k?k?c>y(PvtAw8<LtrNRnH>$O;S|v~_+LZBx$fkV
zEmzyT3eHfEg7l#S8gRdZcPOlvo1F?ht3-y3;8f$t1S%1JQXLZg2%0j=b%dB`3sce^
zVL*OZY-yg)u_1dt{C3&f^nFJ05^b7rWo5%2LS%UVC(wvU4h;^-$}OVUf15#1e*{U3
za4lXhH|-NCRsP9Oe2k<)JzigZmKOArh>}=~_Ej<;G*>D;&r50B*9R4#`{Bt=r0r(k
zTf-+xKm0th)6-jq>WgnOYDu`b4o9mRyJSK<=-n(abkO9u5Xc{@yb(0(#8XxdC9sp;
zSj=WpS6mO#k63+$Toz*)2v_2EI&{1<i|L)bHDohU?lAq5#q03fxE|7lBCe|0zrnEV
zo$&8N48_J!;H0v9=(-Xd&Hf~R_;EooFUE?of|tz6fKV5DiFHXGR$chGI7)9Vyhu#_
zOZ7xLG7H+++O;`QfVTCWJQ?A~9rH>(Z2}zZTOt!BbD?TbS^1q|04@3m>cSq3d%&Z}
zj9j-nY<pln>P2+34R%~ah7_mB72}5wq9_~pMnR77bUB`shlN3!0U;FbXMv?%oobJV
zT0DLPFj+jGaZcEt;jPHpgmEc1CqM}a#?ATXJJTk6>rrQMxrLPsT4|xtd4zeTDq%o3
zGo;*p_ZJ=vl_ow39@cHKN)>!eZcwQe;-*;<r<ur1mTjPkG|{L)&m`4njPFR%a^YX|
z?*c5scHv`xrZR8CgOM6yt|`iDDeGxuP!~U<z`V#G5!AeE2)0uCtQ6c1DucW;riOM9
z31DGumJt^Qh9M;pXM@2RV!w<*`zac=14<0UKnz~Mq&ElP5Z3tTZ+ta|DR47yGaWmx
z6uiGO(TC<{i~;hxH{>2z*Oz{I6iAS=Nkny@0Uv#FLc8f$HHeALm&(;up^t~z8$_?g
zW8xAS>$y*b;4y?mFF1sa0_30?_|EuI;NC+|baWo-i2aTf493y{;Y;)7D}kaVzY(Yv
z1Khu<QGej)|EGCtWe@rl=47sibVL7RHxF`N<4F<A^A4TaY>PD+Q;q;jSf~)u*vA4?
z$mUd*kb-)Tk575Kf;}gVzy1kGlLuDmiBMM204ceDjtw~wqi}j3qSu`K8`u<tjbr*+
zszE^9gImub5mJhKWry4R;WJGi=DKDA<3$VYHmY!7XmmK$zCwPkMK&gAU=M2NV{9Ca
zObT4!xQ<euZkTpx!)XVjN<KuBA~^cFij7VR!4Tc&i3TUwq#HJuQK&;RQ=pAlCAsSH
z?s)Kq4+UjT7xy>6Bm7;HxO}a5L8t{8Mw~JPZ7@-&sqt1ze3HoAaWzI?vEElk=E2hh
zU;bQOS*FlKu^+*FqlytlIrT(*PxQ(MCqYnvA3~FP8KNuU>m{Lpg$G4*-g?3*utb|I
z0lsh73hlhS7EcSWN42l#TrVP8r7nU+{B~Y!5Ecn5AiBvp-!yO*`?c{C40f;;YD2S0
zKT0!oL^n;Ep7V^lX38gjpdE4;f=%%62u+#pqkKa^MSrXj(gr6IRZ!hO9pjB|<0-3&
z6l-B?Qbj^`?o+K*d~hxGMLDAtTnruyoB@G8xfwZ@iC0CsQQkQMo9O+I2=2I}JS|9K
z?;eA0H)%e(l?qUPnhDy?A#=m!KwkUIpeOA5o+jQTJXjnpE=hd#k7?=*v}_G)gd&|z
z>o}cp6~MyxZHVI#3j=pS!WZsafJ4w>JQ25C+tjExce74CN%HSf`y%ZaOi{Uwbmf~}
z`9TFoS|*@Ws1Zr7%H)1r7k@av&`)z#j?i~pzHZ#rF>=ymppz6uiSXKP!tR&uu#GX}
z)FBKrWSs~xDO+QG7$)dQ1UzcslH_%;i*gCu7IeZxES1upCgd_iFTXgiFxV&mSDc&#
zx!>V*S%gaEIKptiFL5NS;`eoo-@oga*$pSsG=H$3YI#R$2*MEW6<vP!7D|my$N{g&
zW`->8wH{aM9b#@`={bZf_YUjbt7MxEDJ^&4oD^~)eBV*I3e$Oh#M6i_Aow=AJQv!x
zKc#VZ4R4hWls+~n<TDDC!B&GG7Wo?CA*^vu%bd(vPNu=IN{QYtrYj|_*LH|JC@zqW
ztLGd>4U&?8An!=E(XUzsEFq7uo*C>4_&@8#_-w#kh8m$tG3Ja(K7&|LI|GxxvDVWz
zqh`hXhAQ!kjI_MHK5e`tlsO7V`iN3ZvQ0k3@!a9i<$cJ1D1<k{+$X2P7BfZ8ZES=(
z%#-@v#XWQm1v>@QSj~mZ;eCboC^~U<K{Vl>RsGTLw=y95dQ&a#xax2&L2!nhV8Kfa
z;(*AW;C%XV{xg9-0|0Prc)HZ%8ZKWHa(|A@c#h`?+syNaZu_(u6_*r@IS|Pxe*AQM
z;?s|(DWP}Hgs}Im>o3H8T`ix7%?PBlf%DELwbV)&R5ZZ8_aJUjz;%;G@?NcNAU>fm
za8YUKLbU(rN%jT$Z#iZ|Zo(FvwJ?_Qlk83YEsnD<oBc8mJB&MYRtlQ%>HNJhNO&M^
zUM@9r{9c!47a9Cwad_6kpg|<)To>M>7qrnqEDZMbau}ex2@DDf`e`}{ohV&_PVLQj
z2_ltE{BNCt{r|<N?*F?;nQYV9KVt|6h9WcGP4<fgDms-(SC+wxN&ryN_@wcu3$eSo
zEZJJ-T`Y;4Vy70}-UQ-5W5_bug==s)b7#C-+c)i>c55`z5Alw+pBM6Oai9~+F3L}Z
zDF0m+yhyHI`tr{>Gs*O8sn`i!2vBt|?^gPstqw`xLuwE%|92;+c}rhZLjSw1=t(@o
zG*}D(a>KCpSA$~bfB9F&fiXYpbnd;Y_y0rMTSGF>IXnFDOZ1;d#rBbF6LJvwVkVnf
zROIQghD2+(>{Hu!l<PWGOFVx1<@iFhf=gjg^JA7oInuuYJASa=$SCPdFFQ!^UodX6
zTXY$!foYf3QI!iqC4Vko<l%mJ;Sc*Hrx@X867TbGWKEPJSsgEfaXy_2Zr3fHm#oB{
zwoi6qmUaV05(R|bNZ;o?`fKJod`Z@#yKN5tH^ho;PKQ}YC7%@>`OSG<)z*$Mxf~K6
zU$c9Cn!e&6nY|MJw~siUaDh3dhO0@LXZ1@}gWUSkdv7_>ivDKz3qKXNa^e5o!<HVT
zU#ZJH57Eh<X$v>u5)UtdjS575?rSMkL?wfKf&SS4?LW$s^A+psT(X-FMl)%#zJEmW
zr@U3ajWVSe0+OTY7q9<rDc*K__3X3uCNzEkba{D(;bvyHKkc-)$x6jND*AseIv)AZ
zZ2#APuX5$onHO?2ap(C49?l!AF)Hb#{SPm?ZNfIgf@i*TBX#9p(GxfQ{SRkp500<r
zsQ*J2YHoSX<(=loW8Nb9N7DCMm)Ap}?vq!U*njs;buXRAZCwA%ckZWl?*|XGv#Ca}
z|IfMDxOr+_eYrfJY7y)|cu@)XH>NBcE;GByo*p%`%R@E0|E;2MKXIJyW%Az(8sOVc
z*g;&mp&Hx_aq0iWL{&bS=}GbuSKjwUAd<GFe34U!cj>xxo#damPGi%c`a{|!u?<D#
z3!;c>x$RG^*J^^k{NE2|l>0=RTMI)!m;X|wck-y=i181AtdXsS%Mkv(C_L{rN$1~N
zy8-^Uycv;mdz8UY#_-yic!eveah%9B*g_pA7~Dcz1|thdtsH7B9770E7TX|UfyumB
zaz4$s;nC*z?X$qtNF_P=j2BAAbXLb9?VZ5ZuVLSF&Pautq4r^C<&J(N>Yq(;3#b1o
z>?)&Zzdd9N_ce{d@gPcpaTMctfkJ#Wj`1P<?;nh&Ldo~Try_q;!d|OJV$sx6QA_g#
zw<L;1U1a};BqP=}+Jw7HCiv@%4=-diHWYKQ#NyeyyS-`LK<okDWoX3SWmE$_=A;UX
ze<I83R?Ml936T7}jpO{^J`$FpV=A2M_MckKYADL^Q?>yHo6kPtMRJ|_RQ5|k=^&v+
zyuMy|y@}gq&vvE4BtM`A^H^_dtz%K93p(?jQBM8Q(M?z$V^?g9e_i(#wIJ}1|Nb4H
z5|Ne7{AU(M7;ej4hn>$0rve4=u7y-*R<o<Cb(yx`bSEwkB2r_jeHVD^#>^*dzmC{a
zwQfR2(*E>4)W&(E`0>w`Y3rIjM8LeGLY4hGj!{BieJ_`EqAeuOD4^!O#(?|&AK)Bf
zs7!)+oV1-A4~?ZeJ2!D?hMVl;yd5=v2d}!}h`YM!TG(i28c@DJTvg~$h(>#<z7B_}
zshIPUGimwIlQD6$#HKVd!dF?@oe$RS#UICji;IYu78$Pl+CrygD21&?!`nOgqXawG
z8ETE`yv3>4`SJ{KTsMnyT0XtjRdGnp)h8AUALSG<EFu599^fPdtUqO@S#VSY1EGE{
zJIGMe1B{`bEgY4vb^65`B<t^BnA-x%lPjjaHr5Xe7$%-Bwdt+28Z`386`1K&SMSm9
z7iJ=XMwc`nnrf0stLsdV=`Wz;v6l4JREXHq1te>SZ~4@qje#$CnnA5cgwEPEg7<lb
z!D2^XR-r$q@C$@=x~H4EPvPEH@wSY9whl34T=oaH{>}$Hfi)70AAnRy(i9^tC6Fe!
zP}^yWe-<6*Ur_i`o|t}k=o?1jdyW=(XQy9Ax=0fWXwzH7Om$;thZuV6{nX}%6dR@}
zlqK25^_Fp$^boFP;4I^Ap<vN9BOndO5=IOhHx}|3e%|Ld`CDK+YQ)At+4_`K(~M0G
zYL4!l^IMgRo=oV9!5?q*6E`8<`qusE6)w55>*&s-Eo+0FNU@d^3u9I538{I^CRO^$
zZEYNPZu1-InO`%%@>=8_5>S>NJ(EAfjg7Tnm;9RLZ3dw1h`eRoc`WcVvt3sA;;QTQ
zB(O@p`1~A!&L*RpXhv;kkK-TOEEAWIe44aVusc5FkO72hB%kREj_k)ssbVagKV-CT
zAji4CKyXsbbt!yG{Ci<RZ(%^%kB$u|A9&GIX4*nu6{2qn1gHrN59&-Dmj7bBxbzX?
zETh4o<!uoUE#mXk>(0VT%P_&C^^m67I?Ng<%5d)XeW)u+&2bR$2pA>T#t;ij38S0m
zTq4YNmXu<tU-k=JA$FU=nR{a`CcMV=#s?dgm*J&-?%Lz|S<x=Jr`jzw9iWWlT~pC<
zgE}3|pJ1ers$b<zFmZOXo)B5Y*sVuEdKZg2^P=lWRXCEfFtAXa7Dt(YuA=d)(dQF$
zDz|^FjR7AzklWZ{Zt&YeBa;xMedLFYByY34W3@?E#gzz+kZfIyVDdOljGDsC_k@3v
zM*^S3m`&$oQ)-r-9QiG=bJnl9R+F&f7{)taS$Fq+O9#gwwMR4W2LGk<I`-gzc4JiO
zaImvuJ#Rw$T$6DJ+Dy1B?^evXlXGE2RMGC~C-1-IhR?vBngv|VtLjFJ6UrR&`OcYE
zb3)jt-uPb7vwQeeQJ|cv_mC2o61(dnSWMbUZ){;t|HWr#3NIM-Fu}q{O8(myZkups
z8gUGaW29TVTRVd=)41JiNk=1GvklM!8Qma)^m|yLvoHjmTQaKCpS*e}N{z+lsTLpK
z5*)^C^#sg#x;^`hzAV46ppQ?mW@`Voqq#KSW)bRNxgrS>?{?Pk%Y0v<y;Q(;5%ez)
zM?i}N@9J!%g+yW*F$Z^t^Qv58K1{__OiZDXy4E=_lSbax%j)#6iXT4lH7hLgEZt&F
z9lY?*AvUt~7DHCt8V~J9Y?+DWcU*_}W&{xi)VEY|!+%|Ow9Y;@F{NEtDyzj@Ws;o2
z9MOa(qP)oCdtrL*+n4^B+Oe7%OkLeRulJ&o`_@<KBZ1dqq8Vgzw)7jZv&xFl?`t=K
zzMbXZYh#houvNLoi?8&rTRCTOs`ubd%V9NLIa7Ef=UT>^ol{NB6ZiUKmfg*-Ibv%q
z4>~Fzg&50XvrnEfd2j4=icVK3m+bOU&x07>tq6wGJDF!c2@=0#z9&Yo5=fKva1(BI
z-7hHW`gJdD+8UI=L}6X4$q~6x8uMRt!b`0>i<+Md^57+>fx88;4Lq8Sia5c|2q_c$
zJ#q=){*^aX-cr7dHC>J~;(iN8R8+LsCd)<)11(fr!;Q7ytDXYnY1@as-d~&7ukIU}
zr<^g@>ExWLq+5leOgMAi(r~>~-zaT}z*h1OyT`6_Y|0`zs4q*YM>%2ufyZm0X8*y*
zIG-qp-=LGYckGQD=?Qads$3zF2sM_To@LXguc!wN4BINz{+!z%X?ho4&6dZTX%pKm
z8WSh~OxEjvHT&fj+Wyt1KZPD4I499ucnSNjWS!{xx;<5h*kWGccjoL9#gDl>_p=ki
zZRe~-EUqgvCuq8=;$>VIOZ#he^PbjuUvKgCvzYl(a`x{Koo|UUnY2YuJfog7)6@)K
z%l^?u;iXO|pS(xiOP;o)8MWPZuaZZf772W3xdDw8M});&dy>Cb`-O*1dP(&DWbYrk
zMq%eN5gG>$HW|Zs_uexNmbA2F<?{qhAaKoe+NHF36tbTw{anaw)Kn!R)c5g^^xxxK
z&@_n=zvh|4Zv#{xXmiU?tWrgG+o?BjAMM-D>0*F)mKya|eu*a94fwU9aae2-|AOOQ
zn=L+E9it=(bmUvStuwFZ3I_^E5vE2&fO@2)?kSt=#vz?j%rV?<-G=4Ap9GF}>Fb!Q
z*Ob})S3RG}W5NO~PtLsUQwzoYeS$5zYl^z6X9gSv&2&{_ZjxekWjDLKU#lQs6%*rP
zTcbYXanCZfr@5ylCT^Aj`D*)MiO<URq#K>j!8zug>8QYRd%S;HnSZ+N7U?+-hrdus
z`{!*1?^AmPILYp#{tds*O?@sTztW?13YyF3cjor##kX3sX%O3EYmNF{O;to3v0Ent
z6^GecE(gTnwC{Xi`Ded|A~|IoGk*(Onjiky%C2}87k}g@df?Yi6X@pR7T)gasrSY1
zL+r^IA7BjqSnFi114X!=+v@PxHA9cjA9{}ZKNo-2y9t;`o&1wt;lPxdG`%UXR=yVV
zJqDvGF#(&UtFC;5MEmBb#o7U!NTtOQW^uN<zSJgD$E#15Z6WkKt?yme?MxtP#r8d}
zW)o-1UrQj&TB`YB2lD{_kbBz2I>qmAW<p2{jRKpqu-)1I$JHIZ$#r`#9q6btl~pFy
zxvg(Ba-q@K{Z~Y9sWd5Py2@qh|J0a8r^;80XLo%3nDyNBD@}@5_(*k)p_Qo8!dlEc
zMWoALt$wZ9T-Q!zMqRZZy((j#;&YA%9U*W)I$3AdkK0#MIL%9RnP$HH&1hM?3ER|3
ztEEb0+ub-@DB2;#O*)IwDl&iXb2Xl#sLDF^nv%Hb*mz>+uv~Y&e#`wz%J}((lV!f`
zi({)GoN3%@YH<YU7ywwjL+z|QWW1<)$u&rz;>XF#{zG<q<bD3o&u8_!O4P;bVZ-hf
z<Ah_D<6h9&H=}j$cX7N}TYpD`OML-J;rS+Z414_gvaBA`dt_Ty>8DtlP&zV$?wW%f
z8sUPL;`Kv+kF%?hRAK@bD%TIL$mHe7ZbkUBU8ku3C37fjeB|}oC%z;S0SGTn7I5d;
zlu&X7Sq#9YS%$}l{J4S^p!1wi;r))pzkWy5E6~(p8O@Tozka$xGGAY-wjIO?kLOWP
zsBcKaKE|Le1;2jxFgrGLc}x6LvTM(FPH7aS5_hdJB`~`pw!^cD(>C&8V1{eBb@$WJ
z4<9MVsmA!KW00R?PWA;)my?(@Cq*qbPwi9_IEP&ll5J+??x@j;e@-;PT6aenjFYfI
zwnrq3ULAg2m0^AyTqNHZM#saCLGOpFntr6uH%34!+Ocvmu2`}d^h8Vev#0t-$aSnY
ze8;K0W`5y@$%bTC`#!sJK2`xPv(cmRRNH&`jQRVOq!}@gpImK?;opkMqg4Ck6XN$1
z7R`z7#FgTqFb=P@=gO4rv=Z`SrR&z*viV>fxEG<bzq$SRpNg-hTXOB>KcgCGUpnXj
z=N*{DrnnV8&wNT=I91zBjIn%<JaE>-#)$lVHvkt@EKJPz{+yrxx`~^`PvA_OU7zs-
zpNU7_Uq5h+$0{M?nycy3JC8Hez8C?0&FHtg->SEiS?jilX}_l4={h^!W|ez?8lA#8
zNhM*CKFYa89CMwy3=-OkayGi-dmHE8U!#(8auY6bMJqgh2zA2pO`8SiRBv1&c^uIo
zbhCR6FI2}vs~lOtb0BF)gFgW*xi3uog}NhIfLXN#+I@VfgsEWuTIwXWbHzsVdQV_i
zW6G<(O>8t3*`a4SbD^st1^1_sR%?yoBiwRxg8yNi+S`h5{r#moMtvSjThDNCTS?my
z@yH#gF0alVKAjZ}!YUr8+l*zzkHhdR{??o_c)IU48J%XQO;PDj<hhhf{USk?cqEBI
zXMaO&d34;}xXviA<)_>x-h{Ym8_K{)$#u5=nTdFu*KJH?u41h$sfZEs!{uuPYpHmq
zW5&6tBV_dzIbLh~i5f(4R^`mU8m{_IogFjsHB<pB-0maYHEM5I4!hj!t6-aO+$Ex>
zkM9}FkvtwWCQ^epLemws9x3$;bD0MOs6FCc`l+Q&Kd*XjP5${ShzSce?GX(J@0!AH
zU3AAaB>vX)KUbBrruDv6^-ZMr9y_~W$^7x;J1~l97+Em~q$x8#m>*3o4qQ3vw+ID&
zt+|YTq9pj&Cp!diSH2AVIv>AH)gM*gf^x{@Nb-(z!&>UX?7|PrgD)omYvEw8SJ4$b
z2PJzmhZX%ud6tvRR9x@(FUovjYddH@-bw4FoqBEe(q%jo5auoBF1h70=r671ZhUrq
z2}!7xV?WKh@-^6!=P*qi5_V%W*?zv}&rhmQQ2LfG@(Nn3`#P}bA?tiYW`bjLLJ^yp
zRyq*;4$0owW)TA`O!c-y$aH08;GvE^x3*5-k9uXnSmdC{&oujm_HFla?FC1VEdlbT
zT4pOPe0FKr(31s?4wt&3jEmcTO&?9v6n7_Utxr{|KVZk<c=t$78GGvZyWMCLBEe*D
zBi^eZaS%UJ-m-0VPB5N_*jL24iJ2&88eU&psoCf&<{HGxFysFr=3Xc<6JXAbG(wO`
z=8fA{s(ruW%CG7JEiR79IS^qO|D>+SLUmf<pq~4W4F9fH-?03h{lTUKI|rQ>!Q3|p
z8M95r>78tptpj9vwn(9D;Qb!-=$g4UgWNv0R!sCqRk@O8xA<oosOBBCjAiF>@m-u_
z@E+|#@qjqE<R)El8u#Ob`X_lnjRb%56wlyk-6t#Sj#|2AKM3Djs(IZOy#aKe1X>nP
zjRf-LUt_P}w3kMfBv_lfKx5{%EtYu*zBH&E>ZAH@{YQS8xn`&6Op($QIVM@wzV(B{
zifL{rMczi?9#bZgnlz@qltvEI%ae_U2dSd>9nH#S*8oRW^_$9cgRvD~`6-c;8f^?b
z{0{@z*v!nJag75X0%Xv(jJ5JpFy_-r_l{*T|M2}sCgnSaDl1C@)0~!xnJc}YP1$CC
zM{Pg8u2NC~0N{<K-+Z-x4WQAmRQ?sgiIgDLGS2b2>Gdn`6@uYULhugl){UvH6TrFk
zR(m`24F1I%{!8<QmInTc(>0Tj58a=Zk@#mnoK{;BouQ-1-E81Yg6@3!U;B8*n%sW3
zc$n_goQV(1MEQsuyh_SAPA;?J{azLfh_!puz2RxIbTqE^SeP(b+1g;VW+<A**V{47
z;XryX1V1fE?8~pgxXCQB`Azg{d^}a3VaBDQxvTY>PP^K+X0z}oy0!n9N$c}jhA;&l
z{)Ep-e$o<myqhxsG)Tx@_Of5~gWJiSATQaC-p^*&YN3^Jm9oRtWdVau{#4>BPa}(X
zzT?koQF~&B3ZL|ZC$hqU38(4S<Gq=5Ma}@)n^EqQ<8ofPj;yHB8m;RodOkP*pYrk|
zOQPPJiw~QiMz!0&x>d~-jz$$NHib@F0+|4zxxt5O4Z-4d`{rWV8(hLP1v^_M_8{Ud
z<o=AchM>avNhNf8Lb{fvTw8;ZKF_=OcD<83pKSe!#^RJ+!Gwkt&uGyhJpuO2ny)_4
zGK_{r8$~go)?T>sNkB(b(P++^`7D8*zncj~!L3nvA=SyG*xMd$X}x3ml}}~>U8x*~
zxwY2wqVw!{rbW9lNgTjCUaE33w;iJ9(~IMJ5utU1gkk%M#b8)|%}p?XoR;xG0S}N~
zX|-y?>=SR0ZVDAx$BKa47Ch6SmcynFD<n%w4z|>m1^pH$tyJ~gUFJ#WJL0Fyur_tp
zdoog97FpRbH9dPlo*D&>_KdlT9a2cNORV&n@gD}dqW{Cpdj>WAb&aAZA}UG|6r?L1
z1eD&JA_4-^J19smp?4CB2uSap&=C+2=`|GTy>~(nHK7LxNlyGf&-=d5nfv9=ojY^R
z7iPlVd+pWss=vMGCo|rA2%9}%Z_UmqOx_et_aMDR_($*;Ai}<&;P))jrj)xpJbq!0
z3NQ9aD>L-+GXWYX_(@{Qshh@I*M2WB%b=s9&+y_!9I>z*kGXbGm$42pa2PQi;Sv=d
z+w&x`>kWPn->r<+&#rgWnktgOU!0-F9jKXhH)&hYqshx}Z1|<g#xl4BTPuVbxKc>~
z^#HW(tyq?QxKsvNwG7SN<c^yP7d5|mRlw)}{pj0L*W`vu|2>bhs3#6vAGj%CG>`Kc
zq!`wV5<zW(;vHNx;)MH!yAo!!ZSnEyB$c;c!S&`NKrg*3z2aJ*ucZrWzA)`*e*2I&
z$QV>2w^qNTyC#8cx`!+BXwcbWjHSMQl|8KdjEfp#_i#AX;?+XF{Vn;H;@5(wZcpL9
zCB>z|CB#!e4cu!2c2u~6Zh;uC5yp)>+y6%wy=l0h;q}tSuTrHH9jOxk3D79In0<xw
z=kMKdS3~bX;%EM3&nsz6?-{2~nI912;^(INKZzim1m67ReCWqRl1sW_aZv-02=}JG
z<0z~7N(1%ipL2h}|4sAS0zf)9xLP|#uQ-Ah>XnSnd$Og0%K!U%jRxHhG&41>IYjjT
z;fL~el-d<OXAP*U0JSLSK()V#tGDt03!q&p3DEEam0m8Cp%V8c!x-arp<x`~ifNbk
z7Zd!T^igoQ(l(9b{ze6rz2>GKTQaG}Z0<#Z^<QZu1Lj>9m?{aU0`mIW51m%A>LyZP
z=^Q(e+_?>q*X6vL$!n1r4wkR9ggE&L<Xp^3Fqz!!WDwYq8>#vw4_;sR9D#t^4w>nf
z8(p*Yj!TR){mlO{*;V*JWMgeMVcqz<-=7~+1Lc2Uko`POaN9|XUU3|$iyuyVaQy+(
z)63g|uK|9i9S4Qsj>YzIpi1d6p6azlPoov|AJ!^WrstcNxLSVRty*c$mvc3)m#z%k
zDPD(l6Hm)_n~L^GOoVnilt;td_i&c-(G>9UBV0R8cDkkt;fKA5Bsd<KsDO+zSSO`n
z4xyC}E3X7+N|?UV3jd2Fg$Vt!dZW{c;t&&tCJcMki%P@@k*fd2-l=&Ok6hpJom%i)
zd|MDj<6fa!n|{O0;V%L$?tk@V0GY{D`)|qQqS^i}*}jEk{a|_^U8m{#sr1QgHTF1#
z-#qJuT%R*89{LN#?b-3V0(ew^T8tlE915_W7LmlgtnroB>fgAec<<MfT%x#0j6Jp?
zmjz%_mxKQrc@cvsL|{F2X6x|9YQo$NaHKNymZC%CyPd}Qpikv+j}|QN{%T%!+?TZl
zIpo-;m;c;-EY~;g`f~xiV*Y6)`7pkVZ2xP5*c<iZu#8HybtnIf`9HC2Gi}*Lz1U%p
zgShAX4|Ku9+8>wS>@XJ>*r;my0^G5Fzq~b9i<Ps6Ebrdi^ev2+`I(?*rW8L(kGMzi
zWZCKb2>=T<p8@<Ui73I{hM+$TvAS#GZ-X)$0Qs|s_Ir<#gIZAI28u{^Tqe5dmG^zY
zv;cT{aivSV|3VR?2MhS~VAePWE4Ef!7=KWV+p_QYZBi{3E&R>M^;s=MYJ>W8?o4+@
zPp51FG%ShzllFizJ(lmgWOiJNiO4}FzgZUUG~k3#Q2Iv>wbet%d{3IPjx6CO$XQMD
zW^Os(Jd60h#^0h0&tsjnGaj>FD070zrU2^)9?g&YMnTu(gji=(RF&$Z=e^`O)1_@z
zjBe6xwe{+a{N~KL1!;TCU<E$ex?V<oFI<<y*|c6<lUFN=$`({-wLd^(AT*lDEI3mr
zV<x{e&m^7t+P8R<s|eTm|A`Zzc6iA}WvZ|Htm?BVjL(Oqc8LE8Dsvgf{Zc#}5L`Pr
zoaOHiTodd6-494bp6^Xlk<ziFUqy!aJZ_7ZI0LNV{sZk!t}@#kBl*iR1$#bTl0Tvn
z=I-(W*Lkk`x}f%cI`^WN+P^DX#v^crUE5~4kCm&CX1c6&<+2AI`g=N5fmsq$|F)3F
z(M<V2SD@jZCy;bgmn|FTn3(q-3&SZ>JN!D=#G55n9?q5p3wCf1mm_J(OH)CdUD3mB
zR<QRGWj<2N3sX%~m$<&UoOM*>CbtY6vnl@uXWCD{Mj*boWx^y2gVDQj{f-KPy|^wZ
z2>6!WKv-B6$1@FgTT_(lgXweM+u_biDcjMH!F8uO$%X};-OSyK3ziE9LT_$dBF>CM
zZ+Dw0CZFF8uhWTgaJe)zbawz$02$S|j(3IR)!Y`-i@5raUD6<8>r>n)d~f-9-ke1T
zAA_L7uXlxt^G{~m>HBn%Fk5+gW1Rx8;w2Gz$3!zC2ryiC+5P<}@<(96flJ?R_`v-~
z6!N0$HBr7dghT>gNOk9c9~XVu$A0D-j~CK~-W~<&UruU3$J$1!-Xu8vQ2$--USynd
zwq=M8p>Orq^nIsP67+)7*JbJ=Ejm(bO9uE!tqy)n;F|cO#_`~u8n)x69d6&+moI9s
zQGNHq3c~4kniBs_@g!_=SuOg#2c7(Vt$VdtwMPfrvrpQB3HD5{dka~oI;>Rxs*Gy6
z9pt8aDQ^%wcnDoNhN333Z)!g6X+YtUxB6M@`)ZcGKd-lTJ>ITBwjMdG)jVmtc~kl&
zMXc6k#Tj^04(m3Q*PT5R2610I)mdR!!NP_qMj8nZQKHMnBqs4=jUkf@>f-|FD4T9_
z;7fyMZpJ73@rHYI6W+W172-T$fOy9U3idmo!hI<BS<AKSai%*a=%uzs!REWVKsHfu
zBWKAV*=!0DMv#qZIYM=3mp9*z$K4p(YIBI%hXPIHG%|NzB|##kkG2?AK+xE!<2xuO
z21;(@-A1do@w1hXlh3(algYHjn@{Z8Bpz|Joc#Vuf^ZCw@(C5M7OFUh_<$C;VJvqM
z?9dzAkqqclDUh-)*aOGKm)UI-8-ufjB8{#@z5Kc?#E0ab4SsIuVwP-0Y40L#URi-B
zA0Hi#^6IE}cBwfFD(R30y}ef){m2#z4!ws)5zycc%J7P(jPLnHqTx5@nbs9)+*duN
zny3XVI&YoEq~C7*y`sx}vLiAJ+O^CsZ&I(E)i{xo$S5iJVk-pjAOd~83Lh!(@<uzl
zZ$AKJ8eSdt-<33+S1HLp@*kI$<_VA^+25d_o3E1A1vkxAe@Z}1sa6OBly|3w60Q6v
zV!BB_^xI{cHLo39@^By6c@j#Ym#fXn5b+qe8eE097*RHAa){kOo%as8ep1{Vdg1&g
z6_N?NaxOvCsE&9#s@WvyU*lmu2KtOP6ENK9OjM=u{WLY&e~9YjZsXnddZ2VhGa3cj
zDoXr40&zkA35N0>XRVAkdR3cF79>YGFcJ2E$GfYnU00WP<y~h8M|SKPMA_*}9HB!i
zmE}a^!~G<5M!#0DD|98Z*-4Bw*leVWDXP)yz2|J_Fwkvlba&frKB?Q#k4Wfh(Q*NX
zH4UD+lUsXzbB@U!r9Zja^8Ln|IX&FStB`SNKAxu%=o{KC?JV!|Xr36S?y~CP7w2-L
zO~!%zO=6?)J)s!DJPfiseWT)wk8Kn`KT9yGQneKM=L~$P!4~PBDz*4PJVsy@ihC}3
z#_F3tB@S0SPo*;U<xQ_LD@IZr-#aG0T(mzK30^1%Y@&a)pY}l?gQ?Ux4F>D4)ljfp
zulk#TADfVePO?Oifq2>s=LQ(mWW@75z*71I<Neb7QG+drrJQPz^7ZlB-PpxdIdA_-
z6Kr`U4Ba$^99iz41*cIxk7@#<GQ1NYyuj7U`=gN`UJq`c9M78$ol8l*I38c)=%}d0
zOu^)1{4+rYR<J2h0Sk4j-%d)6bv1JDzW2W8-4*%-m#%p;?`)T@g5bWaNmgTLpPQ}}
zHr6JKu=+(C=93=9M#K<%Bl@O6QCL44x$o0(0Y#a>3rf2C?+;8DH#0Q7tdjj5RDrsd
z+}qJT-F7%qp-ys{{N!8&oOcz4@*ZLK_X<!vJaqi*FUM7{dUKav9Bx#VgvHu+df?3P
z1L94>3UDH_OJY=WmKl3`@<kbIQvX4;WSc6^Ti!t-xrtX7Fgul;ZC$p~jq~|hS*h08
zAN}4<Je6KmpEstuhOu4qEVHV8yVDC*pZxOq>djxRg?2Wt3_%Vkkd!0^5nM~>9xDYU
zA+uR@$R2Kvx~>T5<{N@Qp?ZsXhXDo8;)ifD*r^>-r5}8$J)YrN7%3*)FJ>v~@7+IU
z<vMx@zu89Yt|@-KYb^d88LXd;vqGVMtW8hiQq_RGKHj$DPl|h%-Q%2V5B2wcS^Z7{
zoT~XVqi;+c_n)j}>Iht)w_pylT(fORZ_0DgnwH)QnQ+W%g;t!wspD*0rtnsMd<J2!
z3T={$S2l?GPyqt!B<Ti;cb~7#-PH-bIqD@C1**T=TWXknS4;>FeV;aq7=B-ihE+__
zm(yAl_X|VP(Z^T`HKRBY4{JWym7zjod1p~*7CF#}6GmIoX(oBipYMha(@#bRwomiI
zfUieS5&Q9ordNgbE+zCA3Ev0HTWa)2$5%4cPY?hi-O6(;3~a+h$f`B^0M{J)hu={N
zrw89_Q01&MgfLA?N>l#VmAZDQ8U-5t7zlssQBSzH719YKrb??j;<ee{IlN-^juq<t
zStATdb@HdfM%j;dDOJG3T0<w>XZh<AI0e;1tg^Gvxy`G2zu4HIO7dtWa=mER;|~K_
zvjD>oC;ho?#bB<{Qm^1mVuMaplFqY>KO!0KflC92m^Kpn577xaZBsv-zq>!nzUehU
z)F!z6J~_(*2q=Zu-#-L-enYk?uY;Re5Bo|`y+vqVkm=X3%|e<9Zpt{a`;uqi8@rJ(
z->q$djg<6cKX;w|o`FR1$)<V%1`vC`srcEk{V2IE>x#cRdIqyQD9M0T=lc<{zZa-z
z=aKB!Jl5<0*>>Au6ZFiXrkH<!b|9GjjLQ$@QtOohbn%@vC2V+O33$Qn`x`C&^0<6)
zi;FW(lYEQ5oR0F$iIr`x>wzxMg)MlP!=z~%aon%y>=L6>7lCR>SWi+yO+tCjUM?IA
z56GVHW@Rkm4BDQX+~rSIG_tnsisojq`gVi!>D{`vLs~0$cYR)y&1n;`&TqGBz>Q#k
zN%%nXWQ6YFk?9i7$m$;6d%1J~b(h&+8CWSxVuI*F+yd;Z!43ZNdxnr?dvQ`KS6QPN
zokAx04G8C)_eUB@k&40akxH2FHsEagsZ#&_lbn$eBA>?SA>zCT=Uvd&!TPHQ-y_jn
z=>?0~S9y5L=r)H#Pvg|F>e}iD?pC<3P~T67CtWO3*o?+4#B9u}4js-9s23d<_Cl{~
zkBSt%VHa+yzCX24&CqH9DEeaJ$0nDmWFG9}f+=>`bw9SDKVQ>nB#hGKDd5p2%qAZB
z9`FzWkF^Lb@E;d2<6J8PU$>ARcdXzWn8hy%O|InSx~M}N1_>J`2K-R#!?@Ba$%Q>E
z^+M05Ym=#mHwAERid9-EYQ-?nt4c3Zed3+Q`0_-9FilfTg=?v`*t*kowrHj5r{iVP
zlv0;K%(TTq4Ay7a)2X+k-M7BjcnWzG@MnI5m<LMTZu~Bj-V|JG3S4Ur@`aAPsmh+8
z7KKlCX37oIjA^U5h!&LH?UDAD@4N&=NZ*t<jB#Dvyvtk9JP8Yg_X`)&iF$0d!dhF(
z^{LNBsGiQUgubW+SkizT2HI71v?nPYPhx;itV%@`-gPVeR!LqEtZnr(SHfOIpKO9m
zqkZ;iydjkr&wO~EXD5*Q34Dmhdh1V|!~W=l+!4N4h~bySKjPV?^=IgWu5MN;JROzY
z6X(fKyyh=6^@${;r5Bynr}Cb0`FD3O@Xs=%n<m5o=Is{z8Wzl|lc&mOz1ZZlBh!Q#
z_=B6ntAnJB8&j!F;BJ4X*`_A_fZXcMR9HdMB0iMoByZ`awCY?y4vY$ASr#5z3e@Pm
zk8A}?gMxWkWF3<}FE*$Ra?PmPT7pm)T5s3Wc|yB2{^l&PrLLK_Ak|#VwnNDqpN2Oz
z*z^=vEdQb_Q4MxG5#M{^3Ge2R(OL$&@|y#{LucrNRrghIH$~}OS#M2rRKnUEB-2pB
zE{ee*x@`VX14&ND3r(GBe&pBu`%85_a+z>+@9_F0uqN@6X-0;7aUJs61zyw|2*Ad9
zytC?3SI|bWVYM!i96E<7QkUhI$+IV7^}irOVzs(<zK&IxiTaQY2oLqx_!at%jdKOH
zewF)@Ef(i3u$S!89VH@%6NUIoUDpj8%M+Qk7!b(tj=$r|Slfoo>UPI5Wgb)6d#vFh
z1J)r>B_8@L!3S6=LS~3=>j5nnWcoKQEK~Nx&{9EjOB}|x+xH<9K#|3axI8~(%o#nm
zGF#F16B|i{bgFUAGSK8^F5)qD(OkXNmC1GP<vBRwb7FP2eQY+xRTa`4yvcMU(Oh<#
zl%cUG=x&>Sed8&w=C2drl78;hm0hkJ{H<4t$7{PxZtBe8%{s5>{9N@~hJ1ECkRz>n
z1hmD7B=jsw?{8S|UV`eN(uEU^J-qcyk@p`Rv@Xt}YTSKV7Tgm~+EPtVmZmv8NmMv#
zW+!G5I+ba$)ou~k2*Sy47aY?V*B&~(I>Ar%is6o1$<KT2wg1qXav!PQwEeR9hDM8P
zUe^h}yXO*7L&F9Giq?NwdZHNfs8=wl#mxPztPzY^^FEsRYE`LP5q|&9-zaM2_y^h_
zF=|lHb_CL8S@r<-4KSfAH(aH_``%x?*2$UTFJQK_ANsQD=1^O2Y*H>_s#)cx&oghU
zOc(ZWkcAq%Hz=jN@er4<>Z1o=9LgasD~XcrA3Z!g3{p^0{l&e!w=Z}hQ~j+@epImU
zgSgocXq*+{p5Wq1`!NO6te(MoVU0v})tH8IMrCKB_Bla}Iay<U9<NL_@m&x?(Vo4`
z^&>U2UpJDX0f(TS{F^C}#kq(L#Itm3)i5f|O(5dwHJ@hE@hmIay?u@6fI!U>TQ`F2
zGkuH!rXBw(_){F^&5zBp>7LEn03xH$e#2zm!LS3AuF~21E~f;}l_~H^T)5NKLfOVm
zEy;{2q|q3<vpR&!tFr2rPKS~d>rG=Rr0xYjpb->>h03ptBGecx&l;l$CDMyXKvJ*>
zh6r#o|9P8oq+%(}#T_26J<{t*1T@V@w(=`(uYsoJ6{73`jw5SqBa{34S0-c&(soJp
z4&<m%&q6o5T_LbB9p@|72w?Tt=~Q0<A+P9-fHfK}bLS9r(~h0FBPO7^(jr)EDsH7e
zZo{_h{b>=?{wG5dAYfU)2l{z#A|zF8)8qh$P&X(usraL0Yd2Ey5BUzEXr}W&4@0$#
z|B2K#KBBg`Ob0N2s{Oc>F#DPJ*J!!i#i)L<L|%UvwSjK*dMoe{!*N9p;o_?sqVfN#
zhpC!5?PDY8cOICH+rGZirZY|~@VbgVeY_wf=%09Um6?Zww>v&@JaSlKtm@`Cw=ZqA
z)X_S>Fae`+w{iZOgF(iu@3U>yS?}nHSH)dA5G;VsmA$cP#(X|bOJCC`G2$yD8BKGZ
z=Q5s3LBCHK*QrWW%ojsimsKXXK!5zEXh#=7Si&px;=FW=c&km*{9!Wq>IGnlErf}%
zyOp1H*rm;7F=HN}!qsN+zVoa4yRAd9g6#9YTnED2{Mpt{25iN(G9bOpXU7_<@rLtg
zoo`|HRY8|yb?ChmlA?158mkiit>v_<`nF`@z06BR;ldEa^3HKdJ}V4#>D!-F+>48z
zGn02n&zroe0lPvVsL%+!ry+>X!g7rc0Q0oly+oIMd+tqpwf_8Vho^C(2!^^qpL_v3
zsex#Ku-p+}51sUTWlD#}9%K;&P*|Ya+3EpE$I1k*2Jr^*CK|E=c`XY2%LAPQh?%FE
z`6sTN+m9!fAI%QCJHj@xS;itA?WfTeyxCwN_?5W$>Bzy*O}5p_^<2w?Mv!WIK0kID
zoe@*bJ3K8RwSOSQbXg0a0WNL(*|ouMaU^#F#dlIMLCzz3Tb@WlAp{YDKrn-yQ^Ozb
zQNm*RVAQR4u-jWCwx!p3%kCSd|MynRou1JCQ3=t0K>cC`XjX(V9CHzvlgM781Kf3-
zy@~nIaqDv1O71b!m*LU5dfj>Yn@)+g7_*JeRmju0t&mM|nzDxy-=9~|E-V!U%y})n
zz9M*oWuA}sbt@U?YfgZeS8ZiPMWc9hTlv*MtTCMgvZ?4~n=2348{JZ9!+6F))+ibY
zbkwg|aNf80uI7l^5S@RDB)a|7MU#7=QVOD+u~Ob>t0%D!C6ep)fOx<CezT8qivzV(
zbx7-3HDi@eu{~u{E$5kaiO82iz=n=1h;kwWlqbuS;;e9KoRmft?)SXGa7WaY`-|=`
z;TF3QPAkvW0*2I|BHE)L{QJX#DvxC3-sS&6pV{v03D0+P2ES^eJl_bG{oDjmonx#J
zUvEQ`ySApq%qa816bkx#3kQ9=$F0zrm(|kxPkCxrL=37nS7Kf9@7<u?>j+HKdW!}@
zUX1q$0*aSGf6yoAd%c#~{GDAO5A18pajcuCniMQ+u@#&~S^gX1T1G5Svr|bEEn3`S
zow=mec5=MEYGD8C9zKK37ZK;d>g%Z;A)&Lbb~N7DQ?VS>>u=<iZMF+eQ`5ZRZFcdt
zenoi;eb}batNEvFxGbBdMmSR48xlE_xH97C{N>L?jB#%Em;(IEukMTkhM%sf2b~Bt
zf|$WX^r1M`D#d^j^F<{hSyJMS`-UD%^cvg1e6v&)ExVZ`n6b5uMbexEF<=1$S(jR`
z1L(_IEF1Vl;}<dzCN1NQq7jPWMY9rQR5Wrz$YT^C(%cl_zI0i^A({@npq`%YV%Ap1
zr>W;gDBL#i9xEhiSAVdV#Hq3xzP7naf3p9spXr?Pi%U*40#gZ}mPb4D6l{^ad_4+z
zdp_P+84fqGzwruo#dsAW&((oXGJJfyjvij8D4cx`giX!Bbex2_`RJv)F*onW5;3Lm
zVjkCn-wc?*)GGZJU!9V(#vd_;>5WqDm2geNW~k5|aEa}#>AGO~%=wWF+PuupUdz6*
zQE7*OPQ%=zGm>ps%DhQtnZMZ#n3MQOZDx`FW<ZQxzce_8C@z=VYZ*J?zSmo6U50{Y
z6*qhCiNYZW{!?F2gXvIN{Pm-!)4%P&QzR<Kby!MiecZE`{-=>^N?HZdg0VJFR)Kid
zl9W(1*M<0Yn_b?$p&N&(c@n)EzY9Z+nKo46FV}9%)_uCuBj9gi&`U~g;zXVdVOme&
zbopemSC2Kh!W=(uCRE?vj+AJ&H5FL+8k(XvL+>uE_cl9@zRgyUj8bkzSlfT`PEzso
zxhYs~LgZ?crxyKdP~VEeh=)wFNKurY`u&G}UeNhnhuG^FVHoN`Dhl!nrgfqd8s>ca
zIg?z?WzLuR3}J$^Fx!fIjN2{KmtCx|EdFVR%teDGaqkXeus(1Rkyy7%;h0u&byLBZ
z*kUC$6`Rz{1AOI#DJ<@VH4<%sxel=zdDmye==tKE<>?v1wqHwr;U}lfs=QF$z7g-i
zb|<}918T>62@A>Qcgytb*40beniM>Bj2+5$7vKo+3)EqMPG64e5?D8U2_R<B^V8>p
zbXDc!e!znA-DL85sp6PN>Ni_S5%sz4-mG>fcT?f()Ih+kYVnl*igh=o^?hIW$pQ?`
z*G)`e9h+r|X$xG9jHS&h_z=S~)6qXAj3kub@8530t~W5CQRh0ae5e6aaF}oLkG$Z%
ze2b-hE_*&|?4{0OO`y`jZ9F((z>MoL>_pW2JK8ZcxcxY#ViIF>CA>)Y+GlU21om(l
z!WMI7nbB&Md63-?dkGWyY)9<ckSc$jGOlODE?C5tM13k;`Pf@_$!W^JCeh*dgF8x~
zF7`xt@ms7NoQs}w-2>LhMNROfkd$s*IJ@m&)IXKXOCG;@x1hNo+Owg+#+`3t*ZCP=
zRFH%r!w%^MWtF*azvUxz2FCp^4a6;_x<J;cF+{y}Q<qpfoC<sWz*nCRw4pb>O1~1^
z30m2oOYV-`ksqb^P=)o-iB>!zTnbLuoo6c83YlKA7^Mjk5b;U%jA3qlSNPuMPJ0Td
z&SBJyKBLgv#!)7PZWK}u=Pg~+fS1hL9pP7gW6tCceGIJm<3DWJvu=><;%xnv)WQ9T
z32ggQC_Dl&HbUh!LSkyv^+o%hnBZ&1xCwSFEpWj|P9DE9*8KkGL@NO$o3}0a1P2k)
zi$L+sY{9o}ez!R9AFq!lV^J&LF-`WeJB_dJ9N+L!Jk{f6_^EgO%77fw2d5-BZZ;lK
zfYiAJeFQvh6;xsJ8DPjPN*C1&tUVt%z5Q2A8^xl2SR3218E(LK5J~CG32&X#$J(ck
zYd68ASy+m-7?72ZZ_-9#V>)569k!;nUdMp`dctRe<$H|dBYRghNW~uJE|D~vWur~P
za-K<6msp%F#JFVzxX-rzoa1*ZFWAw~6D{6j;hR+-sp;=!xH@mwcAkNr2fx%raX}i@
zkh9rvh41>C=G9rFzS0Lj)s9UsL@irwY3G6$urBOPHo6RMD>u0uFJhOFF2AlbKWWk;
zKTKleE<)@I)hp5$udwB)jGs&83~A~|mjmt#1Sy0@q(}s>)2Pp~fRrX?M&(vZ*lXH%
zWO_N@Vl^pCb2;wf=&oC3L*lpLgD$xyeaj9#RdgW!)yU_drkE$!NDza*l%W;q#+CyC
zaJ`t;UXFR{f7Xo79_3zU3e#4*!}c|vBBWV#P3E~c0-s#z8q%P-Z<GJr-_xqjb=!{h
z`yVhkzwvYA0F!jp&uqxC<Yb`PMeY7=c-o1urjnL`i||W6fcP$ERp^k8d}>8kz){D~
zx$*pp!NIksw5{1RRA}V5Gm$cKu{v#$jg!(O|B;q*S(^Pir6JuY##-e&^Q=S(Qpi#w
z21OR*=KwS>L}&N!dBV`+r)`hm$eMHC{#Sh3z*&pZ>?#VjR?)6|w7MKepC09EZa7pp
zb^hZB+tFDu!69%i6JbGvuY-hxi0mIaoXoCM8DANY4Q&hCREx@(afc>)b>=B#{R;Xn
z4R%L(hF4g8qpMJe@%Xcr2SNQw&fupfFw;^WWHR2f^Ii#ZHJ}}EL%2D!vx4d;RmSs5
z{$uF_7do!b71TvHH$27xdx+kPE9c^kk?yC_l(1-xi|Ar)K6^W%Q?!TfM?6foaZ^89
zO&GaoxY}kD*VvFuHxZmYXBen@nVbMOp0|%pzPyQU=fGDL<`bzz9dDs%Pocsbo0qjr
z6vw$|VcBljxB7-Bz-B|QYr=gt8d{GGuWL5B{ekX3?`tgfRyfkhY}_&`rJbblYUrIn
z#b1V>!!jVEE?n~qwkAFod4!eJ`%hJ)-X}<U7qW7{yD+tt#Do*?V{}2aQWqs9ro2~`
zF$6s!W3ZRlIRUH31Es;E9@s|+LI2Z?jfMVF1cS9nbcX%Xe%ZkwvWZCbbt-w6z-0jZ
zcUkBl0HQ_bq{`$`vbJNDkqL!6(X)SfTOBs8=S3mt`@^hbYcG}b8VtOj$jQ67ac!S(
z?i-yUsHDBUaL>hy(;vn>JvF&5J$zw5n^AUvBAUOp-L>E8>hD<I6)%9}fu<Cwes*qB
zK0=W+WT!G5t<x@pFS>tsRu9@EDqU0ey@#Gly;3`!QIVa0GPG4m_5HzoA<?;RmgA=&
zr0GY`P14he<-i#5V)g2_d=bl_1MdpW&Y|-s_XkLjjfwx5WL2e6S$kner1Q{lxVXz)
zr!7xDP3^|js!VDx3EmB11EgBRe3i(q(KdM1f2~w|U0qfS$<Fst#pK?xIo&Lp{r)yh
zd7j)ig`In9R>RWCIL})ie~(C_nB#4${I5BJtoVD9o}#>A@0ve(D)O`CNtos{1{F}^
z+FFsFU&w#*oSDC87_^c2py1`$1nT$aR_$0P)iW_rQ(e2oxx`<-36}A$vvv<tRsD14
zA)Ri($!J;biy10!@!$~KW0`a4!=pdPW?71#ylmXUR`G1;zV+@!O?)6Ell^)<H0L+1
z5|l;ORnwsAU)R|EF2efP{m)`&@)NsH8T8CBZowP-zW9}NC5e0KZ`BlZ<%Yz%p~w(#
z!>85T9OzBpK%H^eKheo5W<*taJJ*4ZZU!BoRw8`b1-9p#zx3w~WtPvLB<b8vq!zpR
zJz9I+pTmK_Ad*kfz<_5!(!gNNy$W1K{1W@h1r=}<SqH9j4!@Zc^ojCZS1y5@&pXbN
zOY7aawTN_avbUyDNWG23m)XCC#4jgUBDul0ixB~!<}{|6jnX-kBl1P)nVf&tL@#EI
ziak4yXue(fZ2V%)|L;-vX-CW0;g(@PII=W*CAccoi8sropHiahS~p4p`_Htyw7N7G
zrBbE;F^V|HZ9vq(Ai}WX9d%so`$0g>%l&yB$zMN#|F5AVwto=~%&84P-2(mNLMCmI
zm6x9vB9&1AsODEE3a0wVh33$9bbu+|U!-jcQTO>cKa@{Tv^afkj>M<C`<>aAhL*M3
z&`q=h-QG>OM1ogd><%yfBG7xEDhy&5oFJn3ICGg8PRJwm&u-Xh^z}-9ltuBYXUh|G
zcv<w#{uehiKtDuMbxI)WmMIRmfSFU%Wh}bX-J>~}aY5u<e9ZfkXoVH>k9zPmxOEH$
z9~Jl0RC?S1;mW6VJwz^gY6qci<vh93#bKy?RD~)(@B|yrl=+f&1)-YRWyP)znuX~^
z=K~rKa2Q-x;%GHpHAE_xKm6briGS?rP~)QX&gChNS9ln%RbV^i<q+CAjVjRmBkYcV
zc~#r5qbfE;HUi;-C!=tCiRALOX(Jdeg;ze)y_m5o!kYhniJy<B>r`$Q4(M92=asBo
zJr)NXd&=VFFx`R^`UO@lk$7?Z5M)GFOF;R>{y;5m;pJ@n9l6y2tsm2s?x>QmbUnZx
z#mO$4qm60t@WiP8ViADySH9#Pb#3|dON<!e41oNx22m2?1rXnDU^pUJ1wwK1!pq_Q
zOAQ_ziBmv4yoKBU{|`a7T5e904XUHoE@|E&8PXra5bHH|?dUd~?&8T9;vwrZ&rmA^
z<+h|tLzE;&uBQ!w4$x0*cb5=`IGx3-A#ef>K0E8nyV-l!oT2Z6)Siuk|35&`;wIff
zqMqU`I-bL=C8$d~nw#)HA>2aN(>g%_IOWHse*X;qo%269{9gkg*%Ha%eY(<i|F1kS
zG^?0od^kXK6+;xs9wrhfj1_cQIeo}RWj~S+SXyMmHCJbE;o)7f{WnRJscOP-theb?
zmB%8{2_jWo^$&YrCvTuw<!zE4$dW#bR*t*}1YsVY3=KcT!#h$7Kp7G@K(+!<yszYI
zRf6fgq20gVAF|h`Oj10!J?3)viXlsw?9J?!ov<p<>zW;uLV?pDynj~U7GgX3v0Y%u
zGVpDg#?3jC@L>11I~D$WMGdixAWHK8eP)8DzVyv8$-SV<Nke~G3Q^TdNLi1h0De4<
zP=h=F1Ci<8dIGc(WOc3@B?tFki;;mvuFbXGA2(g`*OqHofm+<b%Q46MhbcJTAn*4M
z+FC-c{ImL%G>b=HKDalg@4LOH4A0_Qf)?WA<#hee)F&jo`lsUvl-7fVjO%9o3SX@q
zgxd1sJngGyYdkzRs{dwU+xi#%!=J)!Hm?@d_hLp;6Fz6RT6Vq}Yhl0(z~25ZRH30u
zsH;*IL*hZ3__a&N-0RlLrHbT1v@tvG2tNKV$pINO=B=sE<_LMdxmPXO&8ju|@=cxL
z;~hb9s{e1bOTJ>j+;ndTOPY7hN;zv9QJw7);;lk%{Wp=^jZe<;=?!a+N{@+n7!DD?
z@$klu|7ZPE$}a9@*A`DXEreqnhKC3bkB;P@6M@?R+l`l1wJnQdGM>S|B#o0B10G%u
z`Zf+cPT<mzCRrD(q^_xOX(c3Lvf?-#RoQ0O(I52UnF<QE6L1kUd6+Hcv1zC#d4LGQ
zlOe=k0!hV2jQHe>ZH|+il<proKHWmQj=D&<c%!+boI#`E)h;m>pT|<@tb5inU%L)`
z??zlU)8dVF;32`G-1_p;hZ50$R992w-c`<}Sh&BY<$b6azEA(G^H_p6%;mced_nn~
zn(E^@t=~FEO4E`-I_*vWR_0*H>qtlp*%6mZqZ^AzW_6L80cM;A!}AEw?NvJ(miw#d
zb7!@wj!@c{>@-b^9qDZHElW{~Ta4<WS@0`zLeM2-Fr!%KR?FoWI+Z;-O~iRMAn@T1
zr0hk?pr<{w2{7vzN%_}HKEMHkb+D94n?Ovti?P=_;H@jh+LOWNO4?<verUJ7=j8<2
z5gB9!ZuVXg^k{~}JU${%$2*A9mA>>Fe-X*p@oyl-Eh{27B{2s)-(HONAH)?KJ@lPy
zxD~NIdgS?Rk*Uoc>#ru|zWhiS_ed#-fpwKZfO@(a069JHWx!PybKC<^*d#Ux7G190
zEbw4{zC^m~Sl#C^hQuV}zIoXV9nQpu;%s+L)ZclX=|v^HKz*ya{t>M`CdEd5%(gRn
zC5uoI3&r_Ezk4`=FWhUunyz_&Ui+;As9Y#X%&Itt5d<<GN%F4DOLG2!7TOi#z%l%F
z;zBshxZgWa3NWepSuny_%yFh$-6aTEY9|3SFG06#A^$n?d1b{M=H#TDLZ0yrnMfU~
zu-Qju0z3M@P1%a->F4#+pa0@qz2rcC7vevuW%Gw&l>rwLNZrA?0f##{lTB~*`x;&A
zYaUcPR*W^pS?d28QWZ?S=Ji4hJb%Ujm70WF*dYD`JeD^MQL(Zt&0+c0par5lvGN~F
zvEZYf`>cm3Ph0>(g8RP=4en1H9?l7q{tf=goTOKY+R=26X}j3JGv5!Qypz0vHqHe7
z8z-Erb^`Tw8KxxZ>S)i8QB=%c9LCK#p8t)3a6BUPlht*D)IaNk(7hEWU_tT4FL=K|
z27_l)v}U6ZQ!q1<!UnXepp<>YfA5@Q*T`TobSE$(NDC-SA%PeXCw}oxvtW4l#vPSw
zT6gIF>ZMpF*)tB_6%THl%*6f)C)`|+Op}^-6f>XAtFMMYkqlUGl$3Cn)GJ>DF#U21
zVDLUIaw=Q7EH`+ApymbWLjA?y3q;<u5&cG<bbPiUS=IyMa?X38g_Ko&9<OBKl&?kX
zHBt-tT83)bwX6oy)%-Pm)pONGr25$ascCDQ{Qf2%<l1nk>ddnbRcp`r(mBg*sFuYa
z;pCJ>^_O8{)hJvkPe?)0-9Nwk`^`_Hk2EjqVnEA3;`=8a=U386=lR;-nW)GtiWE{%
zmya5^ee%{wc9QIQ;C-@OW<{UE=q~_Q5w+yxMcRtNyXUf|{gppT_^INupFHq>aN?9h
zyqsb=+V)VVjD6jpmXky5L$SK=XwnzQ>T^9={Q!g4j4b`#zeBdBS<8aSLu6_>YYl5u
z^X1mwmUrlBXxsg6eosp>Q98M9M&Ytxc$b2UB2kz3#xJdV?Y&x9tIfb%uxJRla^7P%
z;2}-}+{|!_!MjrfS5WkE)?P-^c+ib-RW%y1+bQTb$g!#&J{1I9<6g2XS5h3be)ikm
z+qg>@f{=*SoSL(i3705LWb1w#$&mWmb?Q^dhi_96EfHGigOovKyFK03lbw1R&jYDA
zQ*CJ2{ZySY>ezHYtabwKT6lz@Va(G+A!=@G#AhF41pJs}D7$o;-P=S)^cr)}NeD%i
zs=3^#X!hFAdHQ+k1de}s_gUU4*0D2SXtAQ9=I3wtY3*Y4cFKaiP$V;x+!;Bc#S))4
zt*TFPVW)}*DsD-mQ7-u@<+P;GSvkWjw|=vKXZz5|8fK~V+ua0rF9Qkh?Pr8|c(^Fv
z?+wfif~>tU1^`X8)Fq2duanFPxh#JBi^?n4ACSF0X!Ne;PuMVW>`91Tq(>cP{7&yt
zzFVM1yQGk>v6)qRZ8J$<gKML|*;b)pGkY=67g1(wigncXHZ~u<+*vQZCTrEqlRLL8
z^PkLpnLjM#{1JR}!Wl6vT4!1rD`T8L|NCmJpS9Y3Wva@RE%NazvPP_v$uZXdI>Dqk
zmp$HMF83Knu58mudJkQ1_5C2)C4LU1)|2S9d{jY+cIo%)p2N`U*S8^{i_QD^zRgrm
zPqjXbQLrqdBp#UU!~l)4(@k_)J?K$h=-Jh`V6y9gT#jR1ViksLY6RjpzD)C9Ll88w
z3oPiF_Kj{iMwdR{U1rD<e^acUUtp>_IamirAHmMeug`rvFw;7rXMC&Y09{NSs!uS5
zI78Z@C6iTtlqmS-dpZFE6fAVompO<kWWJZT<1RgbN-%o?Z(IL;R-h*vTNGwe4eRqA
zoXsVV&!%erG;$X9sg*r%`q;wI15{&Xu4J_7iyUQAynT4D5t}xe{JSMKjuh4B&L4ZJ
z9;fx-Dg#<+(25ERGiUNGUXiqmS1o@F)#ci<-ijiO!N~#{zHcS=Y+H@+hfbe(0_u@a
zZy^z-mG;!2&4b>t{`M|>d`Ie8fEhT^@p;py@*ZgKEj%Fpze$hMIKC#~nuSaGbXnwY
zt2F;9d)F8#6>EHXA`H2E*Ba<|AyqSWXjy4|nJp!Ae%4ZkV(x87ovcMYpa!i^T9kRx
z896OSa=jz@tVi64tp&uP>=@F-U3>a&J#EI%y3JgA5dWPr`9=f42Ra9N_J)KuEJf~w
zOAv1*OQ5zi5_7OO>s=IzAL)_#Ev9Aal0N82*^++Z6q@pq#6h70Y0NdUY@MkqyJh#g
zqcu@B1w#qCII3CNpz%3V)nbD1(3Fk(fy;;};tQ_EYUAU$TO>(PJg^t3M3EoI+|+a6
z_*7okpFF>UKKb=amRw)0Uncn{$XmDCyYK!({yqkwTe0kV_N_l3kj6K%=Ps?n(0A2^
z_DDf}QNLnh@Td8q{^*0s#1Cz&^sNnwK!$$aVUDsE+k-JfUIsNyEp>}wJ7~qrk_wO`
zW}3;TxKY1<j6yLvCv~-1yF00Ne+3xd?hJ7Cf2Kk=zy-Y8H&xL=?FrK>u%0$yOVof5
zC->=_Jbd11{HzDvGB~xC-8E+brchI!_noIG?BKU@qdtE4Wk1*!qhtR*Jj(CjdXOnN
z__Dg`;is$-^u04zTAqr9BwdeRzg*DAa&AB^ml8XI!TS$;754FcdKIEa52Tn`Doo94
zAS6V1JoJBaLN`@p{*dq1$QuB*#>eYjOm79aM2FMM<m2EL6tI9^8{L10R0FMnW*I?<
zQ64||;8sgJ|I1NVzf<)!s&|tV*8@9Jk_;^#oJuo9TrKcQLlMNDy1$Q5GxSvI6U%oz
z9mv%kmm5(yc2759x$647cf<qqV-sVOrszBjR~Q6eUWQV{pqOywvTc8caw9M`C3;T(
zwJblbZ8@BJSm9IczC0cT_5h1$Xt3lLc;bmf`}%x|y~cAlaNGHjeVjr9pO)mCg7y;P
zNfkeVQ|e_4R*BT46Og`84edLEJhYq?U5?R`eS8^&g!GGROV)u#)AB>vx27-$>j<HP
zFM1wp+|{T;)|}~B6c}?*oY=6V08=F)Azr_nWZGEIuDGyf%45H`^GE%yx&?Ow<Vj6c
zjH$E3#>6rkP&`w2+*<B+Afr<G>oE>i3F(Pd`l^KMq7Qp1kDvT_^pWGn|0mumxxpGx
zH!2^fPms%qHuh+TGP10B4V5eE%{og5n&WKhmF==>w7GMn45ud_yy55<4O>+DEB49Z
z69M_{WNoP<xbAqaY>bg`wUwVhmrF4udhPsOLier)$7XKeEtmAW$ogoaX$WXIcL_h_
z35tm8x9aqa{-KiF`n}uTw#33TKec1t9}1a&90e+l_pkYceAq?=-whyA<XS>hxX$oa
zGA6EKcw)XVzxFXXF_qrUge%z2`TkyQwXnmPu7{)V%Z1bQRPRmwXqNxV{7&IhcHJd=
zC5!+Fca6wYMM~<bW=qt>3~m5-L!{;XTU@&8y5<wUou&EjI*D?NYeGrjSb))7l|S`R
z9lrdN(1@}b1}+gTqi{KakjNhA$m7V|IWATSK_Pql%7Wvx{^ib~8~+_Vr_a`fC{P7+
z2illkamGy7MOtISP#RO`F*;tx^UP=2g2~<1#!nI<(k#&%o*m~#Ap15Ngen$<*e}nr
zv8>H|>k{nWE>L&7w`A+Nbb67(GJQ@moGs+PDMLoihyc(@QO{Pcd?|aK4q9+^=??s<
zd2ac=U7q5$7tl0G?kU+1#fbZ@KLG=oWd37mXGl@J03lp;#Xu#+6%v`Jnw-dl64Eah
z*&Yvqy}UzUZN~IQ!Kx_g5FD-XuIA^sz5m`M!vn5;4KpUIsArkkRG|V8`~8)l@!!{;
zl^{(dMfJ)$oNsD2I?&j-h*w33s)@bAW;J^e^^aD+GY(Wg{3*D`Rn`lSNvn?26lbG%
z#9s>Plzl8sA|=G5iy>MC-Xb5#$|=YSaMTX^`ZQdsV6YKf^gdzuc!mT*$Hlq-l%7?P
z^t9%QC;vgo!d=S25o+<X8Jmr<!?){tHty(I2tvffue757Z_`6Px1*^;vKh+fZlv`-
zHe;*cwLe!W7<&R+A6Q6%r;1mAe5z67C$P4g#)99Lrzj%z_VELZ|2zSaw4r6)yfClN
zU2X+96s+0sX<T^CXL&=|XuMv*@9&z>6_i-H;Td+>#lkJDZ5|D+gr37sY9%+!vm3Vt
ze((=w$ihLM^XUK`2Rh+`(G29$Skf{gWaFw<fk=p!tB<2{u;lIM0OjZBq|8gS08jqs
zxAwyQ<E@AX-bX#!e@6a&19xxoU5?%4J0ww%4C$RGFPJtrAyQ4etUElLkuR+*WU+1e
zxY!ygFa|nDY5S^fL-{NYew|Sq59MdWuKediB6J<2?@vmLAy44WaedERo;m!QI~ge5
zd&rs$iyMCveEXWUyG&c@TY>=|sfc6E0?8I-MtOAZ?y`BOoT}QAr;m5s_<E|K4{1|J
z3ygrHxUS!gv3{)ENjH-Ey_bx#ht(FB#aX~q!5Cw?=TCY>db8^GzTjS<WWuykBXUWJ
zADLMyrtZS|KL6QVweRVPW$bJ;CGK-3Zp2eEu@+AQSFY}ho^J49acO@GTcw|}&Ukd@
zM9|*d^t_Lx0m=DP(d1u;9LZYA^LP~1nwa@HK2WYACh_sGs)oy>_Y1y1q-WqD(yd`Y
zD)XAZUaDzc@mYG$VFRKpo}8#%ifKvJHo}})DZg=B0Ax2Vvbe7xnTJJKA5n#VPN4Tw
zXCVFc30^a_$?tmkc!IWVxVkUreVQbHWd&t{cVd2iPEec_DW+*jJkONhTC-1V=i1RE
zIypyhyu_TDFx;v#JUO4m`f8r-X;Lpgb_|>X_D@Z;D(djy0uF&rFP&*k0abJ_OAUOP
zJ-0AM&5KbRE2*S!bc1zRBBpHk*O@A5mOcF}A1lh#vX229N#VsoypK{sIMDv=umF17
zBIENJ@>L)MjtoSW2LdaFy>Dz3V=P^*hV)wCmwI@c<MaOb!EBrUc&p)5zAH}*@hq5_
z2oE|&zkMVqo+<7+sB<tnp8^{A<eHkr^O#N2LFTR7=NwYRwYt9*F+A-l4OyOc?#>t#
z_7HF6))Glgdc_vd<_rCIFVNLBE+Hd&^f2|c%SX3V`oa@-D9_JQ<7QY`Q1Y^{@5$w@
zXSg`Q?CDRU3Hl}IhphR!W4*Ped~bQdZ<zxIvh80q67ZJ#f2mR?kkb2I#b$3zB%qkz
zcHh~*@!!T<6<7u4OuKO-&9f<YbwXXQW?GwLU@22muZ2RjAaCPxJ7mMvKkG)*@x9;=
zS!3KliXIN2i^-~%2AYh$EezG3B8Irj1oQ+Rq%HA(<$rK{lR~gS!DzJMNL`Yg6}kVs
zc&+5l<ufPgG25n0)N3`B7WR8lPcxQxExlC}zXh{UxESPXeYF-G($B}`!}*d;*oVZF
zlb@;({nF<faPs}=&3+DzJ!j^qOXlA#Il5fyUFY}-W>&<zqD8v#5F#Qs<90O2PoDHO
zBA5p0w&n1yT@|#7PHgA-U`pR{8WWZ!N!7M<CBAl5jGlb59OIoH??b;0GP=7$2E+(3
z@T`h2RHH_V#?2J2u|U{yyw972Zul3Gc;*6MTFP;AW>#h=Gbnw1<URvg!E~86)fwp>
zo)@c_$$g?16^vt5@2@QgRtDAzU^)~M-<V!$yxB+;aF1Hu^roDhsR+&<R?2VfSCr$Z
zC6#NIu^=_dTiq-O4Y2y@1p?zPBG?#Dq6472>~KI)O@+-?$I^16MYp@k{F8G5MK9{_
zW<1|m?&pEbrv9{Xm`^4&DTI8royPFI9Am*l`jRcRc<B8Q;wMF?)w>ExCz%H^-lu1A
zy<Ma<x%VyP@r03Sp0i!6Bvfv|5b8L~4;G(z!4siwr-A=PU8(al{gy*!iR+^4(1=S$
z-}ggPEG{CGb`wE@WhD#@TE&r5GYQ>`E=nIfQr<nUCHmssv1Zv6vNN&q;#~pD!}rnj
z8QDu0S-0A~@sP?nFEc4`_kJsBwtGFAbI|ihF#55038Qy?PMF3HZ#eVx#IXx;#V(S%
zu0y_WVHIe?$o?V+Vv<$F#6G%6P<bwah?O@HKnTydJlv^V#kk_?FeYL@QEKMy4ST~L
z1=vwx1_>iATm)8<C10WvAoKXENqHm*QTmj$#deN~WS$(_STD!*HoRAYk9A9{Z;GAs
zy}`3yC*Q#7ntavOw|oH)vMA*DCGm6CL^Kihdiqh`cMZC{jaILusc*$*+xj5VC_df<
z<R@%+S0kLon(7soP%mUxogDyK9!~cR;%a}*n7=FP2oL{?dmT%Jor=w=gJr7x=c>a`
zE6o?*?-sO%Wq-$ur%)t2ByhMZ`SOmAt>L@8O0Sv3nO^72lRIP*->oE=M#)4J$+CI#
zp%IjNHo(U<(S6`>(f1|x*?8~mP2_yUnZd#rM;Ryvlgk5@(+c;wTwrTN-uNn@<pM*A
zfIwxJK|<Op+bo78fMHG^Tz%pv@k~C*W*sj`rXB#k{K0|t9iomWdz>><O7T}c{z^rm
z<G~kFI%kWQE?KMFPKj{Ch939n%(VNB0vc&KYsM1sEJ`zn@;V#a21A;%yb&6lW>XEB
zu{|Vpzef1Pt;|{u-D8tpKdVqBW(@u^k%MU2DRpg9I?T2O20ly-H6;tJ=@TC_v7Pgw
zR3%?YC}E(i2-DmsnEUnnuEV4LSmJn&#{!Azv5~{~qBUqVVa)e5NC}kQ*zn;ye|Jls
zA4^Vh=8S6>#Wx=n{w$IksXN$3@#6>Fpt!n^HH5Km*Qb$3Gt`W3|26po!7m*<pI<Z$
zWQ}NW6&5BPQ&XVtNo-45xXw1bI#?2l0ZH~_oFR-K(qFs&=_7L#cFRZPu~U49f=uhp
zCs`?uJlv4N)sSCihjA@TEZSE|=UY*h=`U>JSMQAPT0RjF6ympehn<pA9B(yyLBTfw
z4IvPdSOwleY{#v>VSRVv*rEPOhxv!1pu_xPn9ELk?sIlQD^9xaZy)6{sx0n1%(Ke9
zyK_c7Mf~vM!QkRbM)j+ntVbWB$XWLKN}4)+<ZGsE6>Q~qUjNwo)!{S6_$tWpeOMis
zK55M5N59WpfVxPJuf+Fw&xzA*Jn2Ji?cX}$j8vi~XBT+w&RSYsGh*9`Z%D_A_g0nj
z3r@sEwCXI?45Mi{vOYYilyc>KJGsK@TX%m$e);HEfSZjUSx;YIkgA$A>#LUYRP`iU
znf<L{wmTsra85SWEZ$>e42q*M(^c&Cx5m_2@-cYqAV)}$tB80Ms&&o?{zg6tMa20x
zMAp!7@Z`4*NtAi@65z^TGM*b8Z%*hLLzn(|R`~V8Uk<%m1@5vN`K|u=s8T7j!ojkB
zt2J0lk$R*j?=6FF?Dq2c*PxjTlDCef*uuw!<LTLfvgTc}<sX|4hUztZ^kqPjvX3hs
zC~RC)2lcoybxvCoz3cI7xbL8lMjBv7(7{bLGb^HG|2d^M=V6BDu?mgdvLJB{rG|p0
z1|`0_1V7J1O~hLaaT<kUskK7du*XCs@P)f&4xi3psWbJA?x~RKqymG0T<N1tqfAud
zo6WZtzrQ6azZs0+8PT_YGakgL|E5n-H>vAa)i=+GTur7{vinw+nOh!8Dje%mR4nq3
zBE#KN^cA_wm8NpuR9mJEp=b(*u;4{nBYPw=U4#qfC3IJ_7=KtKliyO+_;_~_!_ym5
zJZ{Ddc;z_EbLF!iceCe{o>w4rE4zfiCjKGG6!W+JVv;>k*N>HbjOBklmP9+6eR12^
zXeW|NKJr1~WfbIp0Xjj&zE#O0mBHh}y&u}LOs3t9`f=f^?g1Q%3|3U~ZXx8dNO^>k
zsdF|Wp_)07B=jRgQ0O4IxrUWhgpv%I7K3bRFR(ZZtw@hlpj<adPtb&UUz6p8F??2>
zELhGO+)Z2RID;FfiJ*~T%H#lym1^v99Gz9H{^xQ{FKs{{y~qjXLq@jIh2fYgL8I!a
zLy=0;x*8NT=*LH@djz1^!nCvAb;^M&h~RONiZ~T#LlI0B+r$tuB4kRGJY4Cjv>P_o
zih>w6D~=A7xvHI}opr2{rFAm>_;6XM0!1X|-3^?Br_^Nl+-?L;SdSachYaXMwj=k?
z_}~2}3JPUieoQo6nvIwCjtk*pS-uO8E8udur;i`KXyIf2E{G37Kk(t=iVa7L)*j8P
zGMY_R=~>_zV@f1Rsm3w0TjcB!nO6)>*C^y_1w_U`ivafvynyT*HZz*m${nS4!`51Y
zz{6(8l1kcLVQ<_}hjbV`DT>z7&I((@ni^%aGH6@`siw?(>a9mjxRStFQ9Dqt?H7#P
zU%#pf<@CCN!Jc>co@RcyrwyI-2hnoSV9z#(&Wx1>$t3r0s^v@!E+>PAg~%dRq99NK
zv@{)jt+Q!MZSc4V6k%YZ)HV<&J@4+NzJZwF;tU&Vm<A^7%2=6R?yPheH`nq`p3+hw
zQ({m#9jB~KYip#z#I$tSlxRvzI|?jKTk7S}iqNSss37J;P1ZvOv?EzJm3~Y(sUVHp
z>ue{@7=^2nG{KifqMaGGHK1=1<oQM!y4daqn%P%Ui>@w)<;JAO_`58?#be{~!e91g
z<yAN2S2q}K4jCoQ9KZ>J!^Nl-^t1sz$7LpuOpEUx8H~YDLBM$d_X`X!i&2M8jKC<u
za<s|1&w$BrMYM_vqERx<85l*ZN<TRY#n-kIrlxK6nsja8h)_~SG_J397TIx`R6QR$
zEgGfQOp~i&Rh4dNh$dCXyEyAnqxT)Y*J{!{pEXXV!5umyS`nib7{Rcu&Q)xcn&s1k
zDB$Y!fWSf0JACA3TdpN+S~SYhnOdj4*e2b_g6F6RQDj7CnO%xkX2p<7%7zm3g9c0x
zWTDEC=~2kmrhSdp<0cex$Rd=%lOu78Ksmkjh*6Unq#qN8Vja`227A601p%5~I^D2P
z^xdX)HLgmB$cnmQQd<Px#T&QQx+`5c)t=h=0nNx}KY<c{%;wnl{(`|h;{074;G$B|
zWIR?{yXko0-uzR?Pt~G(ROP7%V>r(4WCo|i_D_uIpBz0QvwKWPpuY<Xv{S(S$aCs6
zt$tLPAd03vP4-+fNf9Vh_cr1aMP8Y#MJYI1(}*^B+^9+n(2otn6_jOvllf31F5=1_
zn&1hMD6e<qTTpPI>ldgU7{W8W)HtBpf%L3O(I}$Tf{VADFbZZ)J1AH;2*u?XQ|~hG
zYh*1_Up{%b0xhI;GS131ZK`IQn9yla@^F=_+|jVET5zybkX+MKM+MU8TejmSb*e6K
zY#2ID(~f$3o&_UuO@=OLs1!OhtgdoZJ5aEqN=HjVc$d($p~h8a4;mXDkP*n57{lf|
zw#kJPxOQl0KrbC{=W3Uhi@bpGV$k?-b&}Rq?=o(z;hp^Xhc5*8`;G8hp67VMV09dR
z<KBMI=BrD>3qrlwo>gAEJ-c|xmLmqclOm+%IcPb{3u-wX6Re934!mu`pt0#m8l~Ld
z<ptUoUEKZNA@+2G^Z^+`oSkc2Q^gq>%#-XAo=h7P@SxFQ>K*}TgZ0bHnI<=-A%jLo
zC}LE+n@2)LXF!>xp%fHrb{jU=v-K`nlrm&e6hRYAgA19P($R`|4ME|K0*hfs11cIx
zllrNVa;blv)S_Nwc&Tx}f<U4bSi!Wr-cevjB@5`U*GXLoFh{A4GcfXKrBnrgVTP@B
z)|1Ad5n%!S^rDMLUU%0x(el8NVJJA@T)bgT71QX(2t06Ps3uj%ni+IlNbsT4V&xGE
z^z#`u)bS3M(9j_hBV{3SSB<0Lw@PX305ugnB}x`3bC%c)TWh7a3>dugZ<m1kfwOTk
z%(^25>yH)e%{yhVIZbva;hAwl5Cs`U(F8dvJ@K0186(mX0@Vt%#NXuw{7;v5zks1z
zgHX`=F(L98HPgg4uBs43DcQ$K661NPsvg6_E{~!0GTzMAuc;9o95NIYtOy*Umq#fD
zUL;6qkQ`T)!*bZj)j2T*g|mV(RU6Pdh*A?M9Tzz1{edmlY(3b7_J*V68d+9BXa@(&
zV^k<az)5u41&k!v8khOF(Ou~xaH$m!P6)ce;fe$$>8Vyg+YwmIw!hAfA_I!hj|o#H
zs@-+&#x>O_f<f^_(1b`uq}*NQZro6Xc0|%*(6~rNjKW!LYus2zDe%y%<0&O!&$F8L
zHj1o32g^gIMp6pWe9UM)U=(po9z+FCk3$a(%L$A1s6m=lfWq*97BaZsZ$!*{p5-L>
z=e+k|T1>dVivzS5%y#FNtl||1a`LMhbINNukNZ&=z;K*{VIp+eD~F_y$w<nGi|8H|
z;_t!&{-le$UuuM};$_`}WpX#?5M1?w47WQ4t&+011r;gsIL?z%dZo<abgG5=dR|mw
zyg-O9j?x80IvF~c7Hk|#xT+aiu)8U}5;wTTK!UYXfiw}Mm6H}1A51#x?F}@Eb0o>S
zF`ZIqbc(?;!9s?pi2x;y+uWjF#?)Eslmz1-WH^VTLYv)Okb*FYL3)`Mm+oCwgPZGZ
zMwLQl=LrQm;W7p5uqlc9S{J5O$XsNE2BTzz)nu(#DoG<N3sQ!Pn4Dmm2+YucimHo}
zip|zrF^7v4)Cz?~jEt7!W=B22+gJ*BvI@N}%<d*ag!(4lAqcn}b2>RU`A;MKfcq~V
zn@<#08Z4A2nWHK4?oU_pf@pU$&s;SwF(R1d(ceUxB(EBlNw)8Spn;1+US-|E;;J3l
zrJGL{mev_nav93^(M957ICK&9j*lGEE1`c<?D*bEI#v66b^{lOF71BFUbwtATS1%f
zswzyOz(RtEQ)NPGPa-czi0MW+Y?#|A1V>_dc|uxGs;X273C0Kt=~jqO#<FvS#3Z7=
z9#hbmR)^PAV#z(Qf_$u7H_T!Yos1A1LKGKcDLt@Lr|_t7X*3#}4U37!ii)uA$yj+6
zuhI}GyRUD=lH!TNBCJOe-dHb^6rt0Jl@(a`9(ZBC5EhGBje?6O&=yLH#PoEsxIhRA
z!x&C%GU7=|L~*W=oJv$x2($uIsj=z`tXD50FH49`zzimgVTH&@s-hUJgXQN7q2UBB
z;x-c&+fB^R6S^f4l{J_?0GG?K`Z_T_5zEdO65|PT6HZdNF2IT>Fd+$qH2m-1o;`5?
z#bVj+6SsZxE9sp)n7}w*z@?$3MHHBMhk~-Y+}Drp_sQ)up@BEh48wBOM(fG4n%@rR
z{j}>CD+r#c18~oj)0j>riwFw1G_(8UeyO9=6a8IS;G)vS-7laccUqM>+t|=VoN*F%
z0~?VdFFWjxA4u002x_fBDP+b|?wH=v!l(?|RE-?P>Z0V;S!f-3`EfS18}70bZYvv-
zD6c&3j_WV4&gOKX7)DXn8h2DLdD#(nOa|R}N|38WrH-yIa>n&lmK{PnLc7BSmJN!P
zSDtXk_m@|mWc9H)#}T$hHln+{?5I1wFI`_KDD@&Gqm314JMxl4?zrBRu~H-mF(8bt
z&hZrEC=-@UI8B7x#)c#-Di6E54WesugaDNI6STQn`XVZSOs1z8a#5kD8;aa<ebE=$
z@MPTDM4%sD7b~y!6r=nk8xoDP9BFHGq+f!0XB_|AlML>^m<)UD!xQB-3eWHcQu^<m
zV_NzRFisFqR=4Ky8~Z26`n#AwJH^E?D-Y!TdLZj~S&hlzG}~MhCEcHdG+F==9UL$+
zHSyX}y)$DYG%7i~J?ivyb@yZ5G3$xbxso)tj(Db>AC8@#hsV=D4?puEb~>>VMQPan
z)8CU;6g{);U}wH_`b*w*q)r2v$8Vk$Jqx7|MZ(j(0PRr}n!krhAD{7H^FBPo`l8Z$
z>nfx&dS(>z&hc}4f6`XPGv)Md<ec(*yDoS>7d`*;P95ke)gK?6U4EYUXCs6AFDk1K
z=gxfPn?SYV%nx)Lbl#k9_U3T|-kCR@Bp~Sr{soJ}wd!EbiUT>twM~W9jp(kE(ImS0
z+$_iOe7~gVF})JUXLiqsjfl_(_`9%xuju0L2igL-|Ds}cxTd|d<Y;NNQg+TY&GLM3
zfa=rRt{9gIlfDE0fg>fAS><&r4`lDmDQmPk)C!r5A~{}gF<hWV(LW)2Y-ZBH<k*p^
z-A+&I33MXB{XnY$_g`#&+I?dF2g~F%O`OT^(MfZ&{52zce+o7IK&!AgT(u_ap`xm9
zcOKoDQz8l?A-z7rIYA&WJXEI+)2b);Pn$EcPqH*zAkaww_XDj4+<&n#+MKt2^2^%8
zxk{Pb`{JMF_z<0X*~4@D#7FtNbU;T(tTWlZ+j(sBiK4>l27}G%WH^!}q;AawLE=QW
zu)t{p)8>xq8?Mu6RSJKX7HAXTexTKW`~ASL`?GHUcm*f$Bq2SNoeX>9m;oQ$I>X=P
z0y-6?b%qrOvbUfhpsuOB-b4^m&r(4UT`b4)!uZVO%lh;f+#@zMDlAy5@^@i@)&TAY
zS`E1051{AoCC@HCTwE!qXh9Uv53uFA+j_@G`ny~}C&cAu4;EJ*EUws*Q?~VFvBS-(
z<R}IpSdMovY<Qq%aQC>;y%Gi_#tcu3_jh4||3;^Ezla&P)>@7lNd*Z;xiE+rPD-O2
z;MAF`1>El!e%f{7){mFd6lHU|uN&3xv%6;byF@@I$Kr71S2gS_EdOrD@e^e=(l`^I
z7AKq_p#MY$288O=R}bxd^@z+Uz1H7_1zJR>bw4ID4Q|fFU?lWYy)fW7COCO}wuv>f
z-Z_52{eENk6CWNetqD{setY7k{xF_Da1q&GP`>(b-o|5vb!Mx<?h*un@C+72kt7NT
zGUFnyAJunEMq*TuP9cMd1KKa0)%}7fct$^ghR+ug-o+c%SG#LngiQMN!To+?XF=s9
zPk;RI^f52Z8Sn4109}+r#g)JA%i5b?eyYJ(W3o~Nf%MPuf`ehur`L_{JFG{yfyvRS
zF=0xX%-@9t{!C|eKi~^EJ8Rrn>#p;Y```WYyULn+?_gwI8Ir*548P=H>FA{J%t$TE
z3;r&V-(Xm_PtVlrXHM~VLAoxrMr(FO?S^ATyYtHS7M!9fN=Zut2fJ9#;bwb829HQf
z7?#p)WLmc#(P92BEbs^5exT*}+5O$eO)oln5|hJB<f1g|vo=t0xiP05!@Ka?Er)aR
ztZ8e$eBa;Y=<<kSo!OpMUc2dd(TaoFry2|dPEwx!fh@<%WmJq_7apjYJF3sM!}}=Y
z&;Z;2fct@#<7fBxnKJwM$s9^9=NJa#IDelFf73kTbk~HYDVpXS&RNqZE&u2Zf0qV0
zhs`I8zTbIlU*0LJi)petNYDI}q9C;i(5UD>-J)g<OP|oYN03^nkU<>aAHe-U%V`(x
zN5V@>PmfQC=Rp5m6=eN_+`QtVB8=t0{p}WXLvK7@uzYV;PGxO=b)&`MR!BVr36|$=
zE+#HmH!d@IQooe+*sz|l5&kZ0mjm1nw4C<gez*JoUVY`Zo35}!)4;Au0FFKX#-|^>
z@fJps;QsbcQBBj)(y9Z66<e~3j}%rY6>^mBqlbr^<tUs;j0(9lJ!y1CLjT0*$RM4+
z3)@uy_X91by}1A3h0okPZ?4@Ad4Cs06F|N3>Y^_eeE{z7SX3D-x#e~1j^!`ibE3g)
zmr*26NOb`mF9Zjuf&<k3lA<T~OSy7rhQCYN`2qI>EeG5Wv<txfoq&}Ga(>!<{76Z)
z+3qwu-4r3sq>kelg%c<U7@wImb7=a2#AvNbP7+<cYmYDJvhME$oyqZw-qH))53~!w
z{hbDzllgUj*4C`zL&cR9jV2e%%V`Qd*xVfNbh`Q`$4u;<a!K!`?hzr~!h-x=*iJ<k
zbU)AYzyJRGhE1D!p#wLR76l<ZB>2Aj?$zloUIiJrA7~eV`#T>8iYl^B)$PkawdrJ0
zX`Ml>kjrUlq#zf|X%uwt_^7ezNh4C@ho{BMy8H+q7lbb8e!Ja%?cD1QtY0y*cMtx|
zuqYktB>+hf4i~fTMA`8ZCo(fGUey@5A7~eV`#UQv$De969WJT-Wv}!y!0KRVO6nkp
z5;b1nBZIUNftvn_G1rdlJuEfe--YeibV2vqY__ZCT%V5DzjoV9#?1*H;dG?@U&LfI
z*<i8tx#z95TX*;Aoq5sr4DJWo1>pWpkJJl*;g{|`@!htgr40tN-DP(%o(Y7ckpgIv
z3J=sw?w@kykj#{*kU+H(hHw9`bXoWJ#OhxE-%N*_L6?$@lxCsuce(xp1WlXFmh`*d
z+^}<h&$QI$<;b2sZ3F=B2igVT{w_tW$+G59{+c7X`PEIOwFaEPWE6>>7H*bvIo<tx
z#LgU&KBi}4VuU_22xbv%DP7k6X=2T*^Jg+F@8Z~P$4d)p8nBMj`VWOF4ww7IWn1rg
za6v+RH<sm4Je<-!`LfF{^XY!U{Xn|_+}{Opv;47=s?8^hkCs($J6U9MxRr7m<!vm_
zI~Z24Rt!vz8=BH>#-OzHI2bAD-|6D+N3tocGhX(8pK^j29;iND0nia3a9pjBIqWu$
z<7kqsZZN2#ld`k2WWJm+3)~O13&8zd8Jp8xT-&t%c)`Y_MSF|NMP86*4#07aM~Eo<
z*J~7G(vxo*(?2OP*x!Y1zjSf;qrVo_HjaAqqi-IXGb%0K-2q35MZ#2Tl&f|fduZw2
z{M;OcLgBl8f%}1W0k|K)JP#lg1gtrd_tUPERYtSb;bu4<1p(->L=uFIB4dNKH;(N;
zE+aW1G)S$4Il#^V++SR481d)_%O1IYR7R4^&G@@?tw9x_T($Gqz2EQ3%R%n<;S~<<
z2igVTe&A2+FD&1Dym)JNaY=1cmBB(&Bu$dMAUGM8<@ibcQl<|~>z^2%6d9t|sQg{n
zb^*BG>;4}eo;xZ%(Zzaf(s3&OF;DMAom&3uuHz5<xHm618}fdj^8@Y&{+33I{YYu`
z?wqoHg{L;3ETIToDMK*;;bu8E!$s+}1CpY~W^^Bup3tky&mIKsKimE23PA!=$z@bW
zdOsv$8pXD4hv$E_vnW5$r+WhTUsMPj_jYRX4CN#W?Jp+9ah#M&h;e5I%D6ay`+@V*
zXmu1-H~w}g_m}-y=nv?TPLU*f!k`P4B8eEiE-fZ(NJ{LrBl<=MchRl^?ZW*RcOvrh
z<^2q&l#|;}maaLF!|*~!=`o5f?A(gl&AHXL-F6#65RyX$F*Z7C?%Zo4Bcm?hj==q`
zr&*Ic|L-kI(d|GIgw^4+I$UUdlEed4$^fOJT{T8UN91N0gT>B^VpMR@#T+67+z+%6
zK@f5)8@}6l^w)h^PL?y+oG6TzLTV91|F2fiLz3ezAJS_~uY`~QwNmcGmeTDFU%21x
zd2=qOsq^o;qR~A6!{wLt?eW0W5$(qPtTrfM%Htnz%_<t$qZ`9?5V_z2O^`H6IGs++
zlP=m_%(05bAAbDtn%T22;C{gUtpt~T5?7<qivAEDtdmpJc@7ob3P|yJWtCt5zGLUn
zT*hXnl!~zf(r=%2S#ork_wH&B$Y^TMiTsD(`^sUq?OgJzQXvyAR%in52U@|d+_Ke&
za}E_%=T_D=SR4wPqDY+K1c#dm)v3p2BwaZqGc`IiF+8X%1OdKq|H2<PM(MS;P8fXt
zU2i>EJonun7tNnGYskeLnGX9`_v^Kz?|&yVA!^Zc^ROoC#VptbgQm>&O__JRw(#wb
zZodAy3%Fr$e=9)`nZtQSpZ>aumQfE~J1ssk^gJ)gX^JeZs(WVf(){x3QN7cWVk2`;
zRUODK9GI5;+Ff&LnrbI6|B?F-<rF==_$QS5Z(j6_Tuy=ef%Z?M#kM=Yd{<7%zQT&V
z`Q<1O_vWdds9Nk19oDB?#Du;*#`a2#=+q+xU0C<?f>_;Tn)2e33H?%DoqHL26wtJ^
znv3O}3?rv#m7H$g&L<o8zq@+Z_YYj#Cqa5IZMHiFObk+^?ch8wSY2+E%c_-sgkNrs
z69rKrqfwNGb`)@KmAsQ-k-5>QN*Rrvmw$EtxQE`057&S3@LaRW+EHHnpN)(pYYe6l
zk9_#@qK|L5Ztex#a+h;IO4Fsm#HFhM_a3kC4}SBkc6wFOL$jgLB9)Tbc`X0wPnV&m
z%-av#m=YJsa>769AFaQ1%iedsUo*US%G2|&(rY#7=DF|nFG_0b7tXtCWS_M2r~cl`
z1+;Ts5KwUl=?G4r-sUy~XB`*z*MlOY?+|F^KOL*LCc?XA4BQWNaB57Jk{ZJ&8xL$b
zSy*SbD<}$G`J!h|fdHi}CRo=mF?wvTq-#cGl4K{GJ?O%^f5n09=ay|Mt22dYl?ma&
zuU<c?PkhuT>-T=Y>x9MOl2hdMWBc7PVF=;b-6LPG-JMsu>hYU`1Js{wKJdlnLw8Ra
zHfLm?O(zQ9`E3{aEeIT&IVAo5DZ><=@oHZAdF$cQ%8a<EwMX(ycIVZ@G9H>fN~c!3
z8E(n8L*MN<Zg;y>GWzOay>Gc>fcl@ix_RB7K{mX4>lK)YmDDs2eR$D#56wB-{R!dv
z&lk+Kc;3<a`zQL($Vj55$vphA4_<lilk4Zsxqw^lTJEO^+{v;wyNi>m=tZPM6j}zT
zmB^ZBXWo)u3Bu`g+gvV&<1tT`kFGzBLcxnDoE6d64OaV!!m>rnen+wJ<JZsZ78%O0
zTx3Wf_Lot}(FMn{+=svX{_VP5cV0R1)*0hDf!C?YCq7)h?P%VDxl=D2+Q;?Jby40r
zc%#LRo??-~K`bv=91gdeMduc&Ixs*@Nl!7+&9dl*M9(;M1E648iw=q;yyiw<G+7*W
zr&Fm==ymF5ouk7<cbdUsqevo9tELDN+z)gDyb<o6{IVsR5AQB0mnIHyx<wq_nbNZ#
z#RCBf*`$7{(+8#v?h%JFPP((i0T<T&M!WO<HG39s+`sCHo6srh6&L>MuiHOZx991r
zFBzH=cf7Rr$?w-cJY(ctlZK&(&*T@rNR1Bv;MU9ET)E@REr*|;eaY3sGB=+ny65wi
z^TrOGG9azA!LVS-n%gH1xqI?3mz$mZ(icaHYi=CZZ}za>hl?wp`EkSIJFl8GG<`=-
z$yKj^`^@Z1CuDZto>TJF4;!Dq=F<6>4*I9^TdmgFbFb6Z=PkH;0{Vd~8_YMn^V6?S
z-aMjL!r!<bJ;`a3kS+v&|5Po8BB@i2#?cGjfBxN%Zk#*En}re=V#0zh<bH}GODbxA
z+_YzJc41XjJ;rgQTA3ObHECGiX(Riql?t!tys=?kS>-QV_wPE9*HGJp{u&&r@0XT1
z`;wu(dnCvxGP|^D&h1ZQVfvr|4LU}X#frIEjOTa%@RpqVn-};<u50%meQxpcjO1?b
zJaD64tE_6YJoxsKdZX#ZJLU}Rnauoi-j95|`$YZ&Z+#gR8u-y8xBj~0;HLe@ODgNo
zPY@FkdhLV}b0&@~tEvBC{q8MCvl<(W=m1jUV`fhnHg#A(+|x`nKt(<A;ZMJA*gbya
zfcGE1!E)v#qb=_}k$=alpQ80&yX(4`h+vW+!2Lic<EK5xcV?F%!4%Xs+MSG?mR<(9
zS>EYlVnTJZhGfhf)GInfn;a41?~=Ba3tr#<?9xp;vPw2TcPsjHWrK0}lb<|3Yt-!%
zhj~A}^V1c%6?H#9a)XEo!yf<O`qBN-)8^NMIg9U@)ju&BtuXb4FM|S9U*3I{#o<H`
z#d|;ht<hp%_Q-WEH#6|j59W;<@bv8QXtg?%ZQRqJJTQIK{P6>~oG6+lxqtk;@dE=C
z@*j5{3l7kX&rJTOjNR>aKmYvm?|=LWj}ek_1m13J-2Td)LsH`Y<bJEkhLpef_x-u$
zb^iXL{sRPwJDkk7+mDVKKe2zG-VDp4*tSP<;{5sZRjM`(YtdEQkL+9BXnf|=rAP8h
zk?+S1OjpS1vg(HI$8yY8$NjS>-!gra)gkR2-8hGGOW*lnb$&%{-|lg}65~+Jd!nTL
zWYMXJ;K2K@nKHU>FN4Lpa>t<tqj}rmY?34<4(_85)G#bRcj5@o7-WBjrpTg-nuVV&
zEvc%1c<zi#2W7nV<LY&Lj!YWb@2Q(-iQe=64<JJyKtBHXN4-`P9j4!NvY>bOgkYVz
z!Du;JP^wla@4aTy>Yay+Dy#dYB&rpPMziH`eyLh6fBl~8`=%!Fynx*Q!WS!k-nRd;
zA$?!D>*}-Je<-*3;dj649vAh@ysIKZgTVbjXQi;FabLly1BDgavPyF*>lHFe;b~{%
zWTdH6`X@yXOpYCumN+IOAyCy}dbj_H3-11_Ui(fLp!n>LS?JHp_nv(8>$N*xyggd4
z^M3m1H|q`-RxW>JZneQO>d8;iVnZ2@|Lo3Lz2l`SfAkC)^yvGsp@DJ1fh^A>@8(w3
zXU0W-c-!Uq)lCx@F8=lL8%KC%Z9Z63Ip>`p{-3?`0F0vA`uOhDo$b5nJ%JD)K<FJT
zbVLOe5K%y}Jb9L9ukh^p`SjVI4fH8io{A`_^xkVAjS|v(+1{qleD}_V2!aTbm=NX<
zESuTM%<P?+`#Wdux##@f%@b}I+RGne|MlbMZ+7M;*v!+1XWe?`0E#@@kU!`-ola+U
zb+rb1cUXCO`K&wc{&wE@k-gFt_M1jM{GTxU1Kyw{i}AO-*eTxLBRQu1L|Ff!5G_Ix
z7{{_4&*2zWQeA)4#TFDCO-V_*kj-8y?APnCML%wQZ|RzTUDF=BWpcNaIB<i67-89#
zeT#lr7aM7L=fSyFlR=WDK!|;M;gVf>g%bw%{>P11MOe*Rt=8=ge6f1V=f7+okeU9{
z-*1k#noN4_)`P_h7JW<L_}dT7>6Q$xdkxJ9XZ*szXa(o&r|k#c{dN_w10t>Fg3_wX
z`*weF&UJw3yfUotdkVAvxkcXuSw6yQx_{clUg^mMhPi#gcbBf*cIc>4PsP~GPt3VK
zA<BxOn8WLT{mbRM@{i9PH~jt^CxO#xRN1del_CK9Du?%AN%eQT^VS?J^o1CLKv6AB
z7YMi&X`~XPY*{HW6Z&?&VMwpGKG^eC%d)@9;TiwJe;>GJ#GPYckpIieHvYW-=;r7C
zMko<~pkul2)h{HCy!?qd3)k#<^Ox<{5ALySPr=v!xUpa7L=7y}=pXRdJ74`{Mx;s4
zixP&Rk|0|RRC2WSonLl*uzL5V=kJQOnZTOg?#y}Wi`D;oWafa*NwUI>vU=B-TMsVY
zxc|yt>95V1WH+9xHC9<!Ipp%upWQn0@~jLRvfnc5;kTRFF9>4l9WT9o_w{qe4`KbG
zwmw}aOIGp^YX5M0BuNn~wjY}N!HRv^xoN4%7qZJsh5dkk|NL<2&iuk##*e)9nvpaE
zL)&yDQdZ;m#~Yuq0{`mYZ|<I!1it**-8oM!{5mN*@~wyFbczS&VR*bBt*CcA{PyQY
z9r5g)(-Wg3F%;REd-R1*mjU+w>)xBYB*mQt;TVE?gTc)Q4u7;_y)O_#P;}DpKKI=)
zQAd&Bazqd<j-oIki9dts5N7|gi@xy$>31Hybx@CV9>yxjIvicIC-2GkzmB$9-hb?l
z)VN4qfL{vmd+Da#FE3s;q-U2m9+-n7sF6ZD0Q+m)zANT0N{lc8!z;o>MVpO!l1PiT
zQ6!cUZMRYs=<!G#R)E0}47?{$1avo=i)ds|)n6!ZSvPnB2Z}2{T)Qv7qBg*?z;K7-
zec+F*!EnSxlGhFHF?DFK%%o`BMUpePW!b;|Na@`FeD~?SHxAEC0Y84eIs3V9*KU0g
z`2AM!<IW?cH@x=c3pb6sW9;BNKlr}U8(i|>j3>Wbz596OhG*_DP^8-*>i_6}KAd;`
zxZYjB`V|h({IAx`y1d`mK3(R%`<>Gt`u5?Oq!Pye=w~Z;94lM@&pQgL8eU(q^_gj-
zGm~O8%Jjlzd-K14bXJ$Ra~+N;Dk>T|di=L{j~UrJ19Ydj%{=!0H<F`mP3#xN)VVLe
z^S8;f#|`%TL#IV%o&2BN^z!S%v-gCj{?_kpO>>5LFpVr}P)v)HT3M19T7zhRZ=2x7
zq}MIqe(3f^Kkqw`qq6^;1ej1#T@S4L*hpK1N#EcN@&eB?e67p#+LtT*LHf~~uN^lu
z3*ht3uYX#yZs(YRy`H}HI{k?t0A&j(@b$q@fsh2YL&B->=fr*-MG7lw-}!FU?!4pO
zQsZMIY};~<bW4tZV$OA$DG8!1H+p@?N~<vp>79{wdY*tL_AdnN4{cfaJfhJy$K+=p
zEqVIGukB{j;sy6;PlPK17Ovf!|L8lPcT0)?<mtZy{uw$7`#aBjRY~Ory|-{Y7ify`
zqz3TMV4zZ?tY$qG1^(f9T!cBsV$kC_unY*$2s%Pf;(7ujF&s1zMd08J>Mwyg->Uvl
zMYWDi`6cUfju+NCj@LRtuLW$#a3n0vu$|+hZyVMBvdpxu2~p7&V_P5W+0^3fpZV4|
zlLvL5F|7CS?x~&tJ>i8<>}JF5qxyM6?E7o>WF|&`de03giuSnwjY$K$|7+I75X+B#
zc2VaT`@(rQ7)avoh0AsxE5Co@aEo5Ic=P@M&42aajfoM~Zuh-*{or0N-vaB?bAmA9
z=@0s*BrN><^~H6LYhGU5Eg^c^aL^TtAFkfhD<$^dw_atW&NZ$5@#Dv@96Nqus;OIA
z94E*Y1Nr(7+s5_l`r$)2HM9TiyC%;WJJ1)Pe;f7>`xH?lD&3<NI87KF00s-e60jU4
zl<8Hn6fO^lVGzaOBW325Mie2MQ4G<dnsEC!|Dv)r_>r<4c2)up7orHv`XYmUa=P9V
zbiJ?S%W{KTzC?8<*uS6Ycfplu)R8N8<=npTm;Ko)`_EYh9airS?9MB!XmA!))-eq0
z^!Y*KJ3T&vz#o};?U=zmML~XP@v>!`_uM{t)LoOuo@E^V9N7<i`VhlD@!r?F^9zS(
zb-j1m#7K+j@%O&SK3X=gN9Pyrx+x{j{N2Xv=Rf`~%4YuY)yJR}c<Mfz+3ya9HoW%?
zfudo?hhy@e{%{Fk|ECN7j-6PpBd|4lavy*9%gofokDtC9EbkcX&nd0#bLUH#&8*B<
zgsG8%83bR$KPsk=@<2pZUf}!YH{jRpCcT*=trSV=aFdR-8T6o`BF%aONrH}vB(Qj!
z3D`*%u#R45q{v9KkpgXvbdU&eRevaOwd9u7Y|Ja(S5&>XsH(Ejts`&(M@3NzvYej4
zuk6)1BR*<Uziz|3r=TqnXx-v!S@s86e$krUzhoa9+b8|;DOZ3cdyiKxT)U^v?E_zb
zP?w~6<Az39^iF^9k;T8v7};lhpA4|hmV)B9e%kT)^;ZnZNUm_W-~MR_u&98I-Zehz
z{;P(i#YQ-M{zpDtdCL|3#wgDN7?z*6=!c2@yG|e08~nKAX!$4W_cr>10I0oF<L(|i
zI3dDv?k(VQx&HO9e{I~lJq#+GAhP%Gn>#Y|-3M+wJNsn-Z&^T0ky@i&BWq>OC%GG8
zXN=*9EnbTelB-H)LK=)fY_SMwket<^r$MZ-vOr6IH)@Ro2P}CTWZJ7mad<I_9Ab{q
z8mz!Lkh!4bs#E4=Yh?t%A`_w6;AuqkCJk`+1)l6R3XAK^RUP9Bwn*N3U`ZemiCRJj
z*QW!LyB73t8pZ7~>Ijwn=O@t7?>$`n&NnNItLworWpzr9wwvQ3Z4Aq8I8Y#p;-fcD
z4zvH|&z60+VfTXDr`|MnaDYDLcu*yUZ#aYRIkF$PnBQ;O_u5x0lA|M^zV*7^8OgjL
zl~mWivE=7{M~X-F@A=~2W-Q;j@69DY_w1bV)$@0`e7|i0mHlD)g7oHs(gvU3<!8J!
zQ{(i49=XQlbNFe8FW?E%NVrmhViSb_fOm-8l=mP!j;LJU8mY;?3a84Jfl-ZNT9Zx(
zR?`tEiDD!HpAg^(sv}X0fr>O4Vk`!eK^JE+>q#Od+7f9rM3})U*q`~Ts@k`L4BOxf
zF5g@5)4n4+j+BGz2Lm=Bf(W9-3PMtZB{j-6I3sz=;LO2YE~xWuakY4?AF!YOG>c=H
z{zQ%)&;m3ofLjV!(&1;)k|+f@Uayo_2(RM~(mEWaa2(u%f&_Cc%96%F5twpd$`Y_X
zilLaY3pqeD(9Ak9LdS9f!wcY+HR^N-a^7Qh4Djk$HvHaVv5XowzOzvO(UZ5Gh5Zs>
zErJl;hU4k|wUkL=e~>NQ%OBpNK}l-xHF$DQ`rsNiXFb(#0^Vzs+*kn|bRw}kIb;fe
zqRcU<Ee>o&@HVnrzm{kp=v|1e!^!SLwK^l@H_LE^JGuNV@F4+dG+uqL7SJN-M9k3|
z*c)lII=vDVrV*Sq^v<P3mw|ZaK3WvCFe39bcX$VPY=;bE{xn8Xr?US%1nygx&;QVy
zpO@7&Tz|#jn<tF47%6aq2?DFEZ+zsPF925_yJZSszaU7jeE!1^oAzEkqW>c^uL1sG
z^MpNMx;njn@Qos@CR~{ce=h7d>9OZO{$bgcy;lz(@Y1{+!rTS7$>Gw<mlyv~TGw#x
z@Bu-VUA^nj9oLS&XWEz`-74&V;lDpF+q`Guuzvq{&opn~SN0z`QvC2cpM!&M|Li|S
zkl*l+*p%G}rfu*AnM#Mp6JqK;0npvlxc#+mZ-^C28{8}}2%;bevLMU6D9Vz=gVQNV
zoG1#S++-?*Z@yng_`)N<a9TJ5pXz~s8dS-u1VErX=Yu~ObVORTJ;FpqnhjP1Wz|!u
z(H06PQlo7K9f_j|_zSp!L3U_J<-Hn$aIpwnSS59?O?k!t+mUm;&H)g`39xVg=mapW
z7Tm?rR^!ae`b_AP(K!~Ff~{C?^&HhMYWcJ?jk{`6B3<>#Q*#5L;}gWx+X4HhoWgzy
zpkE-n57+e_CwUs!!rdASBfE`2^ybj!FU6_?G^vMv=u|1_<tp-YSz{!>GXwzdbx;E)
z5M77LlE|0mNdY&WmW3K4*kgN`o!{Zzhf{+mOMVx7beBc~Zv7Z7NzuE06iW}F2^ogb
zbD<v+^au@QqO%6EMuT$%dNQu)fYey7K?!Qe6magC1HXW-Lf3nYM&#)o-wUp4ErzM=
zKR*E<eRKBVmll5?X*GZL&wId48_w7R`sU*5hDYB1JjgJR{eyY})BcNJw!Zz{%KqIt
z{omhjv6=PZRDFP;wGQ|7kG}>i$FH7$AR{>rMYKC|Ap3!jcy{(bv*`Qf+YgK#)a%8&
zW`x7^z;XkH<rP&w`_Z@Mbq*9oOnU0QN9U%+$DF>A3unJxhrapssxQ`T8#^fLrMqV|
z=MggKFwmzx`ra2ANx%C2`w9U2|L69HHop4|u#MU&`_ErA%QyN0z=^4K`x&0EaQbML
zt8@FoUtmB0Tfs+jEYEX-6l6G>=NMM>(KI+;2*iC;en%vnJyFy2vsSsnp%o!FRb^}*
zcL47|pAu;^B?6;>AY!a0;Mt@^T1<MKRj*5qv;tRBPm%^5PW>8!Ku|bJ{!tQBRc)dJ
z#nnF_IJ)n6MPZ!-7z7jn!%D#o2V7Xt8D^!%-!iiI<vlwmN0=kbZ59)tvj23A9e-7t
zShwhzJ2l<_Fx{eOJ^S_@*PhINSroOD-Z1)ht<lC8?&V5zv^c5Dx)O~{Vh*lm^0%QF
zK@6BgbQu8sa<7Bk_mfy#tToxFkvCxx@l5_!HhZN;6m)~9keMU6@_c63_lB`|pb-h&
z(cN6>A<%8;29C!fQ<#FS?4ISiA=i_c!-d8wde4utvl4V3M8-hf@TrosF1Y#wnGNEd
z`szpC1WqDXe2}j?BD1tcU?i5-3M}Z0jXIV6=Oe)Wtp|_1xcK`Bi|O+P_o5gId_WzE
z(G35^>TOHbZ6{Fdk(;jt?B@j;c;pYfwU`(9f6kpYacCCHhXd%eKkvwS{JpQbCnvmm
z|ExH>6<{*^SjjUVeGAC_@B3zVNr`3HGaNkN>j%p=eDd?Agh=aSv!;y5N)N9EZcmF|
z^Wx$)U$5B)@Z6_s>idt}MziASd4VpR{Um`d+r0PnFMrBNiGT5LH>Jc{c>%1ZscsCs
z^}k;>?$6Kcl=#oNQvv${*Yb-ip8I%dWutTbI}5BPiiHmHUxEL&o#O`_0*5~o<k%1^
zdV;|K%>gT+!58EN0hkJ&ARS;hUJ&a2L5DX;^SnDiga1L}AkdT7hE0NS<iW2$YPHQV
z2hcJ&QMDPCos}dEL99l~rq=;~QBM#?J&Z*#Qe*`9M-X;{&T7=#j3lB#Z6;l;*<_>$
zGcXo#)iP*H@q)DDXzAvHlI)V|ZHLRiRiOu`1VwpK1cqZmgf%NAZe;h=%QMsZrzigP
zho`dtbX_@SoYhzM?|IXF;C7azu?s$a=dNq#j5&$@Jcrm~4HNH%&-f%>MlcxQ$1(tQ
z^uaa!@qJ(wGHW!MH3~3~-TyORv`>Q&h|xlgz8%$@0_*=P)t3R->H1G3x((%y?`IEh
z)sMX$(OWbE+!`ErWEe@}*<*Y7gRAso@4%CKaD{uB{VQY?*J@DRfQdxs{#?mHX7>-^
z+<{I)*YirO%OF%|06nizbDYUuC3_pR1f`DmpSJ-2N^9yLdh2svh@L)r$gByMMcb*(
z*~fp{eqejfQAw0R&-?I9$bOC&L3jWD_iL7{*#TI1=e47T_R1hIZ1tYpFIR7Mdi?iH
z9Y1sY2tkwp`<*`jJ^%XH;qgx&Gi=s`k#-9eq<=d}53Cjl(X*d_x54G@6dyC?@<ACX
zv052%1p<5W3wP%quXlQZ|7bSq@0>jP=J7-PA+g0{{Q#e;>Ya06dY>1>EBa=R8`>K<
znmNbIw;Vd`_W1$Bx}_w9*$-}oihAdZpDfKQuDEN;*daaBBQ53#tLZPme|w(aAD;XT
zEc775hd54<M4A;?UZQy}K(od5t{}}h1EG44-_LMW4v#y;I6Ofg!(xcG=_vtxWFW&p
z1Vj@KQ4HT5O87z3t40Yv2nQtqr;@_4a3K?lfcd0x4AJ4Z4#x-#H5+tEcC+22H|cZ<
zc8iH3Vr`}<IPPHlE6b-|%5nulwQm2KgN5JhIJCE@no>r)p<k#xD2lh4dM3wQ-m}xy
zeY<yxi}=fqN@f4)y8X^OKl%7$0|M@12zObFEPiOlO;`3ih5hyf!-Tt}kcU0AUh+FN
z7>UQFBL*{DxKF4pBDxHu`c2T1aGobqu#L@K4{lv7zKj0yTO^(dt@&7TRcZ`&{be(-
z=v4a98lk$-FnX>A#Q@7C8g%ano|vg2DWNo1s4p{2ybHC(vj^95M|R54Q9!BTQ}KkZ
z^r5w2?}#-Dp)BA`fW81r=z@11h?;C%aW=E>C*V!U2x25v_MfMKm;dGJ9iRWQ33Suo
zHigr`wVD{46|nq+Woyf88=t=IhVetP7)}7B^!fsyuimm|cP_(o;gMiT6io*Gn1NX{
z#|@9PnZpAC;D-Nd?T(L^Z)7+a4g<dG%6~seoK}<%?Cx-J<vUAP7gyE_itDF=HUWHu
z3>@nzmk$Q-|NGNMu;0gTnS6Qw?l3^@q+bUc1SZ1M?|<bEgtjeup*d$TSiUF!_~Y+=
z5n(oe`P}`07R~Hmxhv<1w-$FvOZ@zq`@+isFh2cx)8ds|L{S3Y2*u!#b(f^L2}Aq5
z_SK5?gxDAVHX}aT4#&VH`I~h+zgWFB$gmVaJUnyKq!Inu)-QeCp4oqng0&tbjpFOW
zxjP}2clkq=E?``7)ox%32xX0KZ;18;81M$H%=4hL6-80zB^W-+!fc%yRtZEnL2&JF
zLJ*pk!skXv*seg8^yDaf%n*)?(%MbB)F`{%2u%Tl4v)84;_bk=#nYngBrI`@VVD-v
zYBBiCT7wffqP?i{o3}H8JG!jiwJQ7Ams=0kx_v&H3*#T0HQ3G?WKh?{tNM1I*e@f>
zY&25jh41k!Y9;oAZX8^kvIw}@iU$uad;-I9yoKXugTbKF<&t5Si-u@+$bi0Y&KWyr
z{E(BL`SS>6qQ=Zc^j7xJT0m|ru^aI6rGPiM;lEf!l5W@y8lGixH<7(ZX=Rb#{*6#u
ztm`*{=y?VBeYQATD9*-{GV$)iK;s8~`i~Yx^do1$A%p$D2sK4WTpHD9q7?Gc8$Z>M
zdc&C8B}t$+eI~l=!QHDx2x{bXJhl^Ce2_2Ls?q87!>5DWo8GY$;Fjz=j!5pomE|zI
ze^lB3C#6+V`)$i9+<Nec!yC}yXjVqr<$b$nB${9NWOakbbNv;AdUr_+54Qso9(1Pr
z4j1h%D6D93gC&WPmMaJJ8rmzx8<4`&>zlW^Z1esDM@wmrlO^%FJ7<6{_4IEIcuceW
z_MD@+$IBZ%UP;#4OjPHD*kM`S(vxf~tND7(_Ps|+>}JEvaU+srqndXSKA7VbHA~j*
z2r=AqcTNL6cX<2Z<zKDYW;W>WnLYti%JFHGG(?Avl`dYjH6=P?-n8-I<w~fUcJto6
zy+@1uLB>GodUr_~JGghf%e`>fy0}RD^s&Pt%tioyuqJ@Qy8Zb(4jvXIaq{JZ`*-Vf
z_RNDVq7K0R^Oc`wfN|goFb!URh!ZOuF3`i4HvoRX(FAZs`DtKuiL4+6Xcm}af+Pde
zL6XB2CYvG@5$#E_2ufcjofLuq!x2urk(G-oG0GZcHkc?~jMb>evE)dY94Xdjj<cD7
z8KEZ#lMW|woWKwRN$3ayeA;13-L$qg7nH0&bbL=y#gVFp25(SD;L7X(a4LBdN%l`q
zxNX#cf$53Kkyc&nM0K<h`@R0)>+k<JJGTHOus`h5|D1s3N(Y0t&YU`P%B0rVepsLv
zi?Z6@oHc6B#9=4J`Aag#V_gR7`iw;gUD(o=m>>&*=hlzD4WN$Mx12w+M>ljb*=;yi
zUC3<t8oVdc`U?I=RBzX!;nw4reLo8&hoCK<+>`1z5l-O??|@^tL+jWh+px|93?rtC
zephJYCsHT?ll5!0x_%Rh^!{Kqp|&Wr`3tiDBqC!FwE1~ni{nzzOYdGL)D<J}nZG)}
z|IZ5e%HXC8PaEL{4F?@A$)JOCPZ&-<nQ%(QCyc<F^EAL>J-l#+!vkCgrm7ZB1=R#-
z@%O$m=$b)SfSpME0M-I~3zz-XDvu?=Mnp-&u+quk4+~(PBH@Mln-PKVasY!6Bb>Zj
z_{5mhf~}pr9L_Gme?3%Q1Yci-nWDiKBn{2|I<60nhg7~4JnN@6OY7+pWdGR(fZ_Ck
z6UYS_(di4gLtuea@9}|a#^DbEyTKi#MM(yBgFDE8&m6u`fMIx1vEGyb#pcWh%0mK8
zcn+j^A|_GEigY3)f+Q>Hs^El1yHRhVNF9#Z4LV>VfcG|FDiXLdkwItF!Kd5NCPS13
zPTB%X!ZWp0HOmvEH|Le?E2-L;Ta;T?Bgz_z#KBnrW*ox{y;I_Pr^b!UOdFh@)IBM>
zWp~|5>~C<orp%tVYuzS1I$r5KT3DcK<0L^-Q}NhK&%XN9LoL1eU&U~ITy)f1vqqoF
ze)wEh1^|yrMk13C6a%Q19F;<2Iq>Gus07Jb$5$Q(>u3!&yvqO$6W}Y4P$Q-zws`Kq
zFM!QxRH7sblCxH<D{ii?jz*^-Q7Le1;3HS1SW^N{2pXA$#;3~xH(y?$m0>>*?m@(0
z!{WMVbw<HeB~}(_Nh6xj6*WgH6Z}}AzDRObE9M2FvR_pPl*<0|5)3W_wg50U8J^`i
zUXVi^#|Z+>3;rN;tj58KVuL5>@CDt$P?gi;XM|dphZQ9_hgvZL03c-0;er2@K!oPR
zVJ8wHXqq!3z*k8I<{+g+8-O(pI^00ONOxdO0xJQ$nkh0Z#%`pD2%|p6YBK4F#7HYJ
z8f@pBiM`_qMt%h8ueKcgZdYDuqnpCRQv+mPl=LJKXEXLniM_gi=G0+5$ref!(Ms%Z
z^muQad*7~{ymjAw5Cum(Ti6)@rsk?Gd+xn|!Gb3re*V#WTXy3oE6o14=Zu~`euO`G
zYCeD90LgHa9pYFqoHZI$qm`hGhQPc!;EsVS2%;>BSVX*j{5*}wg|>VRc&`aV5e}Z6
z<Sc4L=*)#j?3zE4L^zfY6ZK%frtF1jg<#GRxE-)uQht=ArX$fPdp|MU0%QGl=Bhb;
zRdpPx>~C?Mm|>uS84cibOgLl!tbuxOz(;dcPA|jr<@KH*%hq@T)ebMrKPJG(PQb8G
zo(D*b0B0w#(8iFPvmS(>8=V|e2!B-KS`g*&5)46*;VYtqHUh8LZqz47SqwTN&T6Iz
zEH&C@HR>!nU22RStOnB_AaIxsSLRIM1OfvhJCJCO%Pp&0wmW~#p~41lAfV*B(IUX{
zla+)w$Q3=)ZW__|vdmN+fm0VS9MF2~pML9od-HQ@4t-=X>R>vW78bQ~^S((_o}T~o
zBQHF9U#px0MNNMHZR6%#HG&I-PK|OoLE`Y2Lr(e;(VWQVuZ2GfEb!gY5i@1KQ{1~$
zayM!aT#3;-DgAy^+M^TrLr(grY5Ql|FVeh3{<V!y{#}iRAjuzh9=iR#ANObHsq9zP
z5u~!eJ;>z`0aq2+2i_p<Ww^>l56p$&@Pt@C$Z+6QdAcAmoZt)5EHAL41ndbV{gJjQ
z4T7fmO;cP#IC)}oy5*B|Applniy<kn#0F(G!um`~76dCGVsfO_VgP0Xoa|tta2*cE
z8caGI7!WWI<3$^Suwrk)s)L2csv8eiHF$z_xcU?)h@g3ag*3iz#*M?Xx+O+;iH-ad
zDeYR7{d@9q%J&1mpHjxnTUNT|>$9g^zhM4?)@47qc<pw3R*b%HmlTHO&YWfI6x23>
zIBe-QEt5pV9!JD?69Z16`WQ^k57UO8VLQKWN&fBf-!9agRxPOc&`w!RCUER%MeTt~
zU*3_!Dak7PRbBj2+28)<XBcmg39&+eW*hv$Aj8s}(BSa{bODRO69k{qL5BCzv^T^C
zXs*#004s~K1n`D74Kyh85q=FsfIXZjg{MY@lOJfr6IGgkxnLkMt3e+DXEVYO1e;NB
zH^GR5D6;|l7-7_#3_2@C8g!&d4{J2&2`s{FXzO<RV>NmL8}f=bA1>Z=ysDtQ9>ZXP
zcJQsCImHY8IwcJ1k~pw)(%9Zz;_c?MZ%UG+L#1_{<7^f^^;faK#r=L#VgEnpFL=0B
z*$*zSS6_WKC%-^*J~I}=={f++ehr?20Wv%sD8pblr*vYl?3CYvb8t{#=ZeaXPg<@;
zQLQqe>JMuJ{@&M+GP`ws@PP-6#tX_KrLtdDyO7HMi(PO^0x!T<PGAI{<t3O4K@dYc
z@AL+b*E&EKS?BUQ{lP|mu-+5&Gi<ffD@nj0P>L>S6lTdv=z&&N41=Z)M3PUbzo0qM
z)lwt@9}M6qfx(Fh@F@X;7%2irQHmgpI-*-*6oH|@P>3`cZAL07(hO_}i=n-QAc%rk
z<MHo3Ub*UE;j-O%L7K-2nEqMGHZAFKymPEQGdcR^k$uMX>H2#c5kzU+^B-p>#=LRo
zWGrVb8}Mmb>6w3CPQ`)$L|Dm|W<NNg)Xy6ZUUS{E^A|kYs_g&M`p>FetSCKl_3+a_
zdV0zF^jQ0CSN8kux02=i7q7gmd&;ySJx^Zl4zYjxaM_!4uSu}BaZN~-{i@o9RQ6vA
zI+5>Kg5hFXSX;pxWF5X>qc7kJ&^0a}!wVG+?jXyRIXrGZ4SFMv2XvF+(*s%JB~gGW
zGBm6pYEODBdwLcG(D9(*`U^<Ycm((#&Zd^(P=uKxfi)0khcSt9cC#KQQlhO<7K7cW
zkF&ygS&9{*#o;e#F)e~)NH~7G)fz@_@CLUXDO<V!=(b~}{vhjTx$rJQYf%KL)k-%G
z?=yW^ugv6FvyN0EE1-BmD5z{0^!U5rKxd8_@>);Htl@Y~Yq0;`U3vL?cKu+r7zDnB
z*>5x$eqOWXjvMb;F#nO~+a>$|{?YO*U4=u_QQM^il@g>=rKO~sNtb|>l#+t<=w_n?
zC8fKO?rxBj?rxA~j2tnxeS6>U{SSWe+<VV+?m6dq)d%w2yq*Q5-_5WEXz!Y~`t$;m
zL(1LHUqaPsS59g<QnETaX0}gm|E{i@cD394>#l-0{`Zl}#S4ztLA>fB-g2*#%Wr!h
z@Ly|`*tpMTK5es9d_k3bg`FTnif|6ZOtyT=z9soO8YXWs;G8z(OdhuX0;jfgs-A2j
zI5&<tvp<2Uf`m1o&=8pC&Kkm_kQPU5LMX1SxnsTe3|oOzH2Ce$WIj@b25#N?#UsM1
z39+u@t!c~o=<oAHX2<NFPkYZL)!kI4n{4}hqoG;T0xZHCPDpc2WI4b=%xF1qyF{R8
zV_xF;MoYz)U^|h$J~`^D`)0#2{Q1J^nnUiGMWppTj=Ix38xf~9*tvQ=1($bAKpko$
z7KF1nU;NDiHP6dEp&IRXzkktfNMvMtrvufbUAWq|=LPuq{O%Jt{OKhkb=&Op487C9
z%C(oPj~`jo;$y85?`klTzg6Bo@z!t;lXU__$z9`8r#=lbm*avG&Pb<Ya<)IZzrgkR
z^L15~6lZVzrxC%nK9$=mwebPU#Ig_CcjT#0g_+15`F`P;&nDC)<}BwHD`N*K8Aszp
ztf(hGNw&{tr+3Ozj?CuoT?FXtKn>J)!`61tOgEld`3Q|<&&Jd?m1y9LA#7e$9fPf$
zfOclkl)iC&A1k5gT<$A$;@<I59v?2jlRKbcH+U#7*h4}jHR_X=!?cOf!be>HzA$d$
zvD8j88HpSC<OFh~)P*<eCxR*47hE=i+|&_(GlIA8TfHWp;;u9a={BZ)U>z^7?zp{J
zMQJTpm;LUqq2pBTzz{h3-gjtY@&adisKx``td4R<SbXn0(JGP__LZwLD_Pi6`p8`-
z$@S5dk=+Gv9N18-YyOdxG1O08G&AqJ&b*7NiuZf78s>s{63pq7t(REmTA{;JEIe3L
zw6)fji31p*sCsIG#EC-q`sZ#gwmCmU`*Y4@he^ZQ+jqYciLmf`BRiOFIj}<FNL_W3
zX+%Ze3^(6Id3zdEU@itjBUl=6;l}(EjDp!FR$$|jfSR0x1F6TUIonyAbf?w*kMo~q
zXbBhtYMh6^MgFlWs56~6{e2$VS0w*8w|hr^m@@xFDoBxcEb1S@6eyj;-WjtXui_8n
zn*mAD?mreo&Yr|f(xIOtB|-h?q^{Z@dX5ZV^h`cQ$7_xnQS0=+_{c(rIdAuofKNxa
z!4S>auSDxs43w}XzQC#H8?=jROx+?9bS(h1u)^=^q<jJyxp-@_@LsaNzqC(32nz|x
z6FUM1yfLL>Q2U!NoX1&JhYJ2(*WI`j8&~+^sg3Tr=?*{DyH1m*)9A`<v>Z<A;n`Zk
z)qK};InNU-(&KGgwYH7L8Wd|yMI`;@Y3*lGoFGaT@DFnwKDyvQLA@^@S%n1;<UWRV
z_UX`Tl37gsP$zkMPH#2k5!0lUTGFp<-!S~btTlM#fk`h~b!rosFc4_!5*o^eJ0J-V
zBvm^tQa=hv=0z(nKF`JPBBHZpG|Ntj>jAg#R7o9IpF?EodcHHIG*Wq*%xX%c%J(3L
z_0h=1gD+z538Olm%?3FErVd8ft8<GG-mPzj=_HbWe)TDZi^f>r558xAU*GitKjBRc
zU2tYRY)b@~t{zk)gfaGnP9dWJp-E}L-9M^HSOWm9lY84$*V8*2*E60MA!2U<*GHOy
zL*Vn~6HS!ONV(}Xl&sloC3P0i=78)y>Z?D?621QSy@&1;84HF6+-v~G)}~hMEMZ;@
zJ|4q)ay0oKYQ{}|e?IXOc0d2~kvqZ%<~V(qnZkp#^?khND5vb+To}qz)l*aZG{VzU
z>nqmpDj5X`R3R~tIiLAZ*|)ld?TKN$cYbGBHH}}-D-j@*0w`3F=`JT+{Dy^i8_{z*
zBpUWM<azvzPUg%PWz`8rmBhhYWn{NRm1bleVuJ+n40gpm2D<C0rmxG{{f>hCt@QBW
z#$ET4xbl`s7tYuJMPlJNsDfzYSxq{4R`;pEX*yl1*7Ir}(e8tVwv5gFIO9MIw7&T;
zWabhto<Wl61j_UeUiF=*pgjd-3XiWJ*7e$<bQ0_<7Efy*|H(+Tu{AlX6uart)^9Qr
zye+Uualdyyhdd)t>AY2+;_#L|-3M=ozmtlCtCbxz%Bj?a=A{Nb2;@zd>3U%N%&!PE
z8207rBftZNm3#VjzDBp0xHc3^IPl0aPs&fdXXMEiGno^h?@0c>$a>zBE7;mFv&+7#
z%)tZ_B6fe$1Qbkne6!hGUyA`>1TI}~MXj!^(sfgIt)EX;FH&JNn`?O)K;HCU#D138
z8Ay8xMR^bO&Px#DKG)y(bj_#AU-p7#x}F?jx@{uPRv|u$QbCqKnC(PmdiWfWbDYwv
z?&mh@ok6dHp1g|C{<W+9roz~?P2zR$4C`i9G<p*=<a0B9A^&XYE_}l~*URp+F=NbI
zy|R@!3~m~KOw%71xUGuaU`vw=HriXxfpqYq3u!L2x)nGx^{puO4~PxX(^>~U3-GIR
zyA%P@n!kI>Eipt~V_J<eznVyML-K(^C;L|bQ%rr(c>)x(TuqZSm%~uzl$7><8Blbq
z8Z%y9(hW<}r0w`~t|XHT&7y(NCNwJ=MrB+^ItAQH#O7e0F81`EoQ8I-Cj-Rbl{w9T
z`+2)gel6_B2oZVIW`zo|sYG36SJns;mTwJX^hO{-S)?lu&bp=A^VU00Y8Un@J>erX
z9Uctetf9rNXLQwWy28E)!GmII1MRe)Ni0xmt1{VHTis-RJ=@9oCU=0l^eI5MasOqz
z7H=eX=A>!)BBH@+v(YWb26&3+dk<H~Pp)XB^-z`iaIvA%XPU}KBq(uM&WN&@hnIU&
zySEV7+o?@zFe0WJ-+gBsy}F{{L0KQXejd(39YTPb#}+XGG%Us)hZ%1?7S%Lqg<A>^
zu_>&8d|f?#KYuk25K2No6kOa)|H+fopqBh%zxHR5VLhyt%9iPt9@~_;!6Qz-JW0A4
zvd`R$b_xd^78?S8-Gm-ujH>2Gv0FZlCp>u%sKmj}Ze3#BlIB!)Ys)ZgbxGvtWZA#t
z0D<q@2DEUcHfK(UIMy$@{ffo{I_T@VTNe24=O@*U*O-I)jm7|yeC5Wnv*(Y<ow76_
zPpKOov*Czp4E!>PR$@Rg6tH6S(HCKFigA!_eQ&<5-39hf)+5fi-h+yC-62?eK*_s|
zZPx$r1%^;Z*zL~f)>fdi+jsFe<U3B8-u~Si+RMoIkuJfa%uLH=Y(dLtPm@LHP1b6)
z!nB&unPyTM5wo+GTSw1RtoO1qR_l3nZ8N_5Nwda#*JAz{uldpPl4p1G7H-)XHuu<L
z1?)j@#IXc^)o7DMywb9kYaCrXys_8l0}xzN_D>HdSF`m~Yx}(#HvA@5cx3^e9@J#I
z;YN*_0&|lc*Sz{1V;d!4oR1H;n#MVBYbLhmSH&^e01w@(@Z78%#OBIk3b5em<3QT(
zzPcz&Hh5umlWQ5oKI7nhwNMVES^f&r6JJX(G4XGhnU>%_tA`hVYtl+IHFyhO{SaP$
z94>t!eMt0?rIr01ZlC74!Ef(p-Y*N&7J=;#XkgVoB~_~Z`<OQ+UOCm{K^8Na6X>od
z+ObD?)@$_5$+lA91=g*2Oy1!$pvg-fjgq;&1fh5$rV2%aB+)@T!T7Jm27cDsQQv3K
zch@oxdp&V_c=!s2ikjHn@!84@$!4EjJtDL?M=rRNRu8(eT3aPNc?C1r^L`^h1v1UP
z>s{UI8Jp7f_qu|<PIrrx8ep3t&Z`}V;tyk-Wp1Ba&9safAVX}{tLnvMu^TRh?sN5w
zrt`eJ34TS(Q4jljygch@gT>0Cwdu{@f1{HZ7)C1Sm)5S|MDDkL@nJnC(4h-@@{82b
z^rHWC&Sv!<Xpr<)4_qoLV{?v!`Fe!TsprlYm`l*%`xr>akbd!m%CP(o#*r3|lDP>Z
z357tS_nkg7pJ1;;lrZfnZkfG>Zst0f75|`-x(Z1$!|=2yuH5zFKAF=vuVIF<dqWxK
zbs0_=7A|%H=ENm{@02)sHggFuczzLDs6i3Dcb`0Z<hMDIxwifb;ajV)OiYEj)NO}A
zN)!XurI(4XADd4?=upSpFEvqz&`8iB9j!}SIBr$=GV0#*Lp6_Al-Tz;7LSQ`X#>~<
zeS$#3%FNMX$tlp7o~x1p`47)G{x15SDEvfpdziM$`#4Ms?750wt9A-}9UniM7=N61
zE-b5^re$|@zqg687Sj*JWdNLSBpvvyI1jvv=HJu6aB6@0loKl!Fzp1Pvxjbqz2;sa
zI)!_;8)|Nm{TwogVpBS&FRd_b;{?OwW->3u0e!jWd_l?N-jx-d9ZGEauM&+-)N<Kn
zwgMj;USa;@rAZNFpXiC;HXc2Qx9*nc{}D`OE1ZecDiFar6>GmBaEm{Y404_y+T`N(
z@{VV!aAfF5V=8Wzr#sk@U?lV*K#_jhzb}M!SF}t2Io*q6|2N+9|M`ENfk1UP$@7UI
zJkyQCGS|D9MKQDrvicS$NS`p-EttJGe9N_^rjlQhm=lp}gya1$@PaJJoZW8Wq>V-1
zQ~ifVY^|6257n5ubY+7(%Av`35R<U_QCWMp2<i0}sF^MI`Ku=~+IFzNWHH9-bLiuI
zJmVd+l<;L|k_5gzy`IzNO2~H_`PE@r9-0*XcZ$T3ca1iLo3I{`VLZ}3?`!gbe^;O)
zoK0u=m07pybhtA^(<X?S0}SFcsDJ&L;vw;O4L{i%+d=^5&^gdRy-2`B_<%DB;aGvb
z&$g`&*n1x4+)!#|-uki9`cK0&DpjzfGFBt0OidXRmv#B(yAAM1xeI)RcFiif3NbLB
zL<FoH<wVPm6yD*G(iiz^wW<An`l8|vUzzw()}UuGx1Rd?CrNVXQ@3I&zs-g_jcLyw
zl$#g^@Q1X4QN1{Psc!uV3!&41(|dWXeT6)o4_`mojTM5{S&7F4v)G@H;h&TAqDcdh
zjt7N<fBn2lN<=@4ZMsSqAvULrf6-lfST5hE-TO1Eqp}+TXh76AgXlYZy1fBb!;pO8
zdfVY%kwH`8(_WF{y6)&FMu&d}009l2)!F?SmI!o6>jGB`(@aJ8)_6I33-a0{oMO4*
z*&1n=`;axNzi?Q3g%cP_th{-M|N1GNI`Pna$ZZdMxJDQA_mK55U;Y(~i01V80<KzN
zw~BcjWk$S>r6@FHo0$9b8T#2{YiT1t0<Fir$qtmE#w!$K$d%$0^E6mYJPEs|iI4ME
zbfT}cONBwJ89G2?j{A1PRKGL}v7u(Ccn@Fuy2#-~lk`GnpK}Q4gL2>|$yJ{Lj%S@&
z@jGortUo%u`7BhFEZ{Lo_iJ6y?mOpRD>+{O5cJMt(vHsw^u6NL{Ds*`U{j0H2LM=(
z73i#pKG#JbC`B0K7&9qz?N^*9XnnBVm#t5`pFwSZD7T*c-1m`(pWygLD{|y84fkl0
zd6T$XyL{5APHf@1AX!I?<<Wyu*lC?F)X8i`KK{FRKGYpHzEqQ5ZU9tQRFzDyjJ!Fw
zmtlK<6>O+vB9OaU!TAuhK9Dn?YUa~5vaKEwIrp`i;_&@d$VJQ$GIjxqfcG*&SP@fI
zDsDVbfR%HsoBxk)hw(7lqRpGNf@W!-Nryp3n?Jj#NaoQZ<yFYlrqLxLow3_Q)G2fX
z8I#`%<^IyfnbQBxByq=Kke+wbQbe1w%TTY3N9ZE|??sOUWk}S=nJum=mw&o*Nek_2
zCts@GKi#4eJ)_H1<Q<ORk>@=9==(OU$C*?~Wy_vPIaF(tL--BH3>{E_7@8hWVJUL>
zlt5-n=MJA}mQ=oUY#Zs`7nFoYq~DH(PNr}cWGp)2(~<Z6Ja=v0SwsY}5;>W^Dm_ww
znsU_VMP?!L*|7>fNDCEHd*h{}>c^Kg{kyb7<xr$HX?b4~4%Dg8R9X5!iJaiKa(wJ%
z00fqj-Evh5USst`P_FBMr>K+@9XBki1Df1#h}9^f^_Mrzan|C@HG5l;uao_Do61qr
zc~3Md-|EquaIxV~FUSdaoCW5XqZM{yEJ6x56PJpyvy`q+<a(lUYX;5yB{S8SvZ(U(
z_osF-4xB?xsw3;DJ|yx8aEME><}0}pY;`tzL_sZuqQh0o^sO1;E|ccz2OL=XJFq8G
zDxT5J#E0mpW`pXuG+VIKE`12X$Exube5q5g7@Jp&qY@Z)8E&BeW8p1{XFhwo-0SFy
zG=BE0R_a~f-~HKEyyh?fYAFl<EQcEdX6k@NH^AiRC3O$w?KUl_>n^Z`t#0~vLO#YO
zrjl!}rIT7Yp7d$`E|XHFj=Ej+^(O+CRQ&k@xomyRGirt^dxVJm#R3yiG=<L|1`$7f
zxf$_Joq;Zo5=Tpu6ytdMQ-GqZduW_$bkmSg^fZM|@H)ZT!-Di><r5Y(jMk^;k~CWH
zc}A*WwpHRS^X_9CU+0X{H;?qGX5*djN$w|9r@(XEPYj|oJKop20EVGydZiKYQUb9K
z&1m|INm`G7aw+jI0=e-2(6<%)_1;(I&1dP}<6{SmodrJ;j?#b59*tG3z~OpH;b-?-
zw7lz*+u3}G)dvIgAxLTA-Bmd8B-49~=RkjMu{~e{1M=I@^vAPWy1t9}3ayzHd-4VQ
zr1Trov2a4{t=ht;Zgx@Jf1Ds`r0Yz4tTPVpj+C(#6pgQ$VZ};^lbZ89P!S1N5xTeH
z+OtbrH+?u8Pv87_Gm39m>Y@{VO_!Cl!NjqU%f#bvivC~U4DZV;@t`Kx*rHJ82;!4u
zN#Ns)_W`@MFAwdHX*cig++>UKWA|Q&f?8_4k0>`g&->}GpnA)jg~HY-E0Kxu34rPj
zcsgLo>4^jn0dc%=mNHWmN?SNhmOub;?2^-olr?7jQP!>e<`?sH=F^93T+`f|1)r8W
zC$JWIkAL*)kZW)WbK5~uFl;=l$u;&bPQ(t?U1XeSKHCq`@=~450JDR_vHlYFa*he>
z;3qnb>6?;LNPM!~)&K=^QwOnz*SFq*+oCe>Ul_C|F*~;h9Ve3^mKswCCzQjiOpKk*
zTl?x!2)*{~d+T*_DTz(tHq_qwEP?oCR5@gFoKf^^@Sp<@H<4$qLIbJB->#lW%{9bz
zr%3rJae4IerhRY2;~Ey>GL|q9?}f8zBCI=fn3ycTI)mSKxmRT6K~%Rf0;4-<(F96}
z@n)gTtSs|wc=g2Yrk;Vx##XYt6;WqAIUAh^vC<)nn50Im9mz|DRD!mrOjgr6TS82)
z%iKQ5qv@S*+EMdUFgFW1F0YCr7afy}+xAOzTZ}c6Kt;nd^U_i1S~)69k{z8lzP@|9
zkH5KibbNlg2`Zd>X~ud3EmD}k1#W4=7=@ksZx+`jPgs+!l;cHP0DjY>DA2acJQStp
zIEIu?a2gwyh(c3B&uyuSDvdtrwZBj>Cu0dcq^-(oxei%eeX!AOjKVmw|6GF=5Y4Y)
z3b~$d3m9mO@im<~!gt9pnXAXF39qgr{ec&Lpb$sNGNr(#WTcbt$Y(8AF`r0d8QGYM
zDX&xD_{`I--Ph!hC^Y$O1n^P2u`;)!WkY3)Dk#6=RUooKry@Z9R^#P^b=2_j2a?Le
z19x}W=)qkSP91paFfCT{4m=4UB^AQey}D|1ymCrKJX#)mqpIyFPZxjeM7vqX(gU*f
zF2JP``6a(MLG_nI<XgD<%V4eYl|Ak%4+i#1bT_6aF69K+EwDBHaWmua%iM~PE<>sB
z>X5vd!OVn|g~9)oTAVdgpm?Cw=G-06MVuCeT;j5s$Z<5fELf^nLSPKl25p?c#oCdz
zIwP(4Uh?Mc2C3*z)CmmYHEP<bh14=e0c&{$^``aG2Q3m*>Eac)%hUAMs5v;)XTP;r
zRwii1^5hfH+Z(3hf?O#|%c}I|H`1klH_`nQ!3F=P#=8L;x;87RT=}F&J6gEdaS4Wk
zuNNd}$k6L*;>iOj=D-J&Xb!uLvD}lc+Jj<VBIc8L*=Gjl#jE@={4KY`etQjuyu}gG
z=X4gSteW9{!`HRKI?%+H1}eYU!b4F`g4V@2oOeV&lK&cPL~y@S%YJp$O)qBZAZf62
zh*d~w#)b7*CXS^RQ=`E!x7)-CS1|s7gIxSh<L(UuqaA#^In{I<Qa3St*h4oH?~W!N
zh=l7=hCRq%@3odzglHMEZ;WAm@}rca*ci@N@(PcLS}-@#b5C;<B<_0{h`;&O&m6@x
zDeblj5y1<QHEygdYf739?L;1frSB*G&`>1rGxEjzEH-&Oo2rIChMy1KmQ?WCUz5#?
z2jsmV&6bOOH9{rTagro|OM|IsWIIM?F7)ve)0iu8`lO9jzDRy~$1&N2V_GGS(lp@M
zs2Nb5@3|LbIeJT1<=jfh{EUnITpd#m4X3~T9@IFT^*F9EaddU497ld>r=*GY%|mxL
zUj={JFF?Cb)^;8bxF3Up!ji3>6=&9W@l4mN;YVpy6W<X=(M^N9&z`+;m0nI*`B3NZ
zvqt)Eq<4=~q#Fg})F(QgbpCwz(kw9JT36O#?A}S+s=LhdHJX=D!VL4c3$A(<P*GPq
zaZ-JIr*^VF?*vZjdub+S`b*}cqv&X7{0>pPW}+E~uht6=dJ@|iB^p9LPs`gooARHk
z4H>%*`-M;KwG*j-)m3({<>-5=+`yQu%Ijhw+!aq<B^7qI+Odg_o^>c`Ux$Le6cdly
zt@dVAjv&9ZinaPttW+kxYT$xfVgEL*wcg~OJ($cv{l$;n&_95oT(^`1w|fiN%rR%5
z%S}rSjeys@L^JjI{<<pC^S!B`^}rTjyN1#@A{%=q8S5DH8HcPTkt@ix0;*uu{52#3
zJVmpGz9^8Zt(~@(Vo^Egve8OBXyH_hzPoNbnDj!UOua^YG?^hc-_kK;Pk+C6Bjf~d
zJ}J+VPJb!OrGG$(ORbnJLp+;!Xu|Ok|1)7KwLWzb<qdNh17#co?XrNFxX@rWgJp9@
zAX9i6JAd!jmUBycw~OE!cQ0hTBRZKc#<iI8Iu=8=XW<#&=frvMtH07O;U17QGFp3q
z#+$e%9{BD4ULpkkYDB}<>}Lo?(x+zsFR(;fiZN4+<s5yJ+oc0eT7OI|g^c?=y~~GI
z*$*MBo00}-LzOc@mgKvuqtXJj{6qu@C0-_WlZ=<&JR;uTr_(r^RtYc2pF0~uS`jK@
z$UPw!)d-${D%5jrZI~xneO~8c+2nsaeD9?BzapsQHhDpfj`1yB*yXlAPZVm2)fHiQ
zt9QBWB1uf|rS(UQE4l1hN<J}Ln$in81!aPQr*zlx!c0kNWC;|OlOLjv;^<yFPFu2v
z^?6K1@~A%;qWv!*pY?-``o%pu;PrQ#Y;0I;Vc@1}9Q9AFW|L*;2t$^oV<o>abh1=$
zxtoo**aC$YC#+gvYmSCe)7+*xQhN`qu+eyV{GFo|(w0Z*<r>uX03dSY7pcNdidE5u
zl+$4@di0<QduvuY=!k9^;5-;26g^Sh<(p@JD<5XXTNX*VUojW=FRm|10O;GMKbcL7
zyuG-0lKtN+bC;acJ-q4@dz8_9vERm9>O~<UdP`qzcaB37fVuTAw0>>(-97i|b)wj{
zwl_^@5Oam^OhvJSV(0pM7RB^k$U;S8s?3-83f$vf&gF)0nE0g?)1B2N$L$cYQDQn4
zK7$n{Y)jfFX4e>z-()wVXv1&LNEw$Glzed;h2f*O;1}}`r-0G2$OT9zv|z9HQ{UgS
zET^`L$wK7%M9WHReK$=FD&s(4W|uk)g|G2(G-8HCDOP{oy-%mp_h#)b`uMiIP4ssi
zN*3pA#aFYYWkCx96Y>oixY6kGP64=gOz1(bYl5j_2Kx8}C0jML!w#P8O-q%{FGYtL
zwDXDe(tLAFmtwRR!i*JWnXwl6!hgeT<+FSiOg}UtyiY;CATGms<V^7C3E6;m#?j@N
z#_oeVbygCC$4rZ>pHBEU)Q&n_|5QryU<7GdHv{G^&Drl*+T6Ok1W$UQQ(Pu$>6|4<
z%*suauwav!xJx;W7a6E*D1|Pp+>TOv8Ej&AEi;oM*n<D|c!Zvz2aeGftPlKE^n^sT
z^l7GIv6e}rYA;>to(UO?327sZqg@oCMs956hnTwl<*`yNVS|YYWkdG$Va!h{Ki<<~
z9YeXeMtMpx$A7$i9hLdYIe1Ql8?-W&{qn&B!oc{w8EPAzt`>;cy2P!f=wn<eK)WCn
z{<ri2Upii$2luPQk5Y2kyQj>x5+yBCEQ*Z;=}gvaKYC@dKb4(z^n8Q+9GCmVIq<Du
z=9%=M1^PLP<P9Z;mcrfDzYhb^!U4KMfS=PG5!aNeboz#=?iFlO0!E?l+|mE+jeIay
zNKrlYjT}E|PRI6hGb+Pt%JGEQm<dqp!k&mWICD?M3e~Fozp@k@g3DF(F{0%zroSf2
zOD3yJc<tTE=f&;mOt{Y#x6dU>Cj!gSOEFyTx`wIf=~@L_RN{p7)H2}%dt|Mqk^cb|
z?fL2MDJx<?vc76y?^{bBxd$hVykZGV=KYxK^1l4-UT~-eUz}knpZ?8>u$6O>O0oS3
zlN8Xy9cI5_+6li>$7ATQzDup(A=%%|QpiW&f))Npb1}v87x}ihc!RdY)%%M~TgQgp
zF43Hy|G>=|P0A!X5{1R_-S_JspH{D@Ot504z2nBoNPx}EOXFFVke+DG$fkkSaV_hR
zO9E7|U5}R;`Mj9OKXRzpr!L_G?+}q(HuGSa<Xt{G2nD5Bh5bl0(*wbTvfWAF;}se?
zqHGWw?SRMW!am;7Q}^8;8|W>xmUhw%v*Z*&`5(~`&)oJhlyKVWCYvT!q`k1Hrmc2^
z_8P|)Fy0m-cz#DsWOBbPe*NQ;PWn+9fS!4)9Iv9a(68*4zs}!a>}sSNs-S5=kZ4#P
zDNOs~Up#Fdz|@hO9pS!<%&be__24m+F<0fKu<$W6Q>UM?96aCXoZU%yQ|TCxN?8_}
zfO!8;tPWLP8GeSi!!ulP?RX!zw8b;*6;107>G>cZ`8R&<aR99!kNj`#A5)!%A3<>9
z_MRxdh_s_0vZEnRT1b{yYt#TZ3!3js818tJSht}(ycraa3Tg%=DH!tDTyT1%#a5CT
zETK8sOB{nlJ&fja_CiK<^Z8&Oj)zFycrHEh4pX4iHDqGVsF}C_ISq>d2?mbC2Kz}v
zUS}0;X4(te0SVQFCSf?|Z8xR)Y$F!Dl$^GX&GdHi9h^96Zo;^sVi3YstK$l{c|kbz
zTsm29s}s9kWoAn%IA-Gb@iBV$!BZT%CjB3!fR|w2)8Ic5CFw<64y2ilT#e9vJxV6H
zVa)AJW*_5I{?9%I#T|bL&2(nA)&eR>xe~YOe980&kK$emooo^KvK*U)Tl&V=91a__
zFIO-XR1QwmlAw#^%kG<cjggN<$C@RD8)>GWnlGnnSyo-;xQg)x$)Z*%J#ZGp2Blln
zX0GD`6g}0@t+j&>S_JNOuupb|V16sh`;<F#Q8QLYZ4uJ$a%_2Z0C>IxeKrG+D6`SH
z8;^R4)we+R_VH^U2*b%t@+uy!d5VT)x%B-q32A4=-fi(wM|+IkBq8Y4abm>QHXh^4
zXu|PdF}B3ldY%NHWkrQ0ciS3&gybN$EUIpC={DXsuIf)8I-WlS*=z4!Q_f3#M*f;*
z)Q<1^v+aW=mn3$LzO?t-o9-OQjjT^Yci7(T92?deKZA7+&__ph5B6{Ne=`||i!>{q
zE$#=k5m2->q}&Ok25*j%CG8~+lL`SeTR0Cmh>j_ku{cFnTIKJBuc%H5x2r{n0Z`Cj
zAo)y@0M`N$rjBYkYI<_TDDB6cgK`A8AB2zMXw;><_n{T2trk6ZpfAd&&2GH^Uw=7A
zEsgK|S_u-e8t-N+7s$_9MXV^fkUnl61@(U*i$C%y<e0dMq6%AvtiNQ?P>wh1KQ_z3
zu#@$LeCtN->eb$6fr=V7jtKcM!>({_^k4M{$B+hDUX}1mqwIU>p6!pxG%BMzLyo}>
z2_b1IGzImXqQ3UfgEgX#MtSR^`6lJ$Bl@nZYdNl6y7|!ADj*B*`g95lJ52wVxd3T?
z5%D<9s)Q7$5JLmln9=s642=wSKmJ+CqK)#+U~h8ftrRh%6>$nM8?1POkIaqPlmGz1
zm*w*bxbE#YBbb4Rh0V0X>i_OnsD_XQn<QEK3T8Fy5M-~&UCOhTyWjJK#GN;WokOca
zV&=}V8Y;DMd-E}lV&S95be-ZX-4|PQ5zH$GeW)f207i#)go(8R(mVRfAi7{>d%UuQ
zqsmR<@wH*Rp^TZ2%*@~Tr)tyT^=oprLZT67MpV(&eJ=DtEb^)FLe{VSO77}E(8l2%
z2W$GE9jdU>TJOpNjK2M(d2kntp9f5fx9?I_k?C1X@z$pT9+nl@^^HDEBeI#6x2XM2
zc5da=wAn3W*oJ(7;)d~S8vhopu!E1;;3?oMDyG0{<CRpIep^h>WL^3Q(k*TT#8N93
zmb=}<89liD40PAGBGm3iAntm^8Jo*ailh*YurQe2!#w1OsLPdi5W;VvVa9A@ARLYS
z@hQ^s9nxr3aE&N3GrNO#FE2O@8?>XMjw8+Dg5$l9&JR`<x<3Cx(CloXj^SOw@uVHM
z;#t9<hQpu2p=`umIQc4!_-uObyF{^)(xI410h3U;8Xa+0Bia)+-VR!;_{n58r30Te
z?E^VeU`z+d4C2*q-H+0&4R};Nj)d)qD}C?lbH&0{o3-n!-X`(=?ev4dh%r;JU4_wn
zD&u;}CAK-Ey<d@4KK-RbGMZ`ppo@3Qu^Nl<PmNZpoQMwLsdSdTOCETQ6B9Jc{3Cu`
z+3I({2KC(Te@O>O$;gCh!Gbj^{R=H2x>*8T!>83xnx&sEqwpplC$;{A08+%^gyIz~
zD`<CZ#q=&V9-Y02{+t*9VdZw)5PDYnjCf7-*=Lq5wL|4oWm^w(pGvkDI57&L^J`+S
zi_}~{-@Oh}!~}bUxFL{_?2)#oTN{{R($0kq(~vy{z7=a~^f180`QU=|^RCRT9l1sq
z64)Q9nP@T=_FK|d0V*DS4*{*zDi;N~*X-3^`;<u%2$>pq;b93Tn(XnV7Zthxt*4OW
z(e)zwwUIN1)J%NPGdgM9a*_<#T)U3~dF*7HC#;YJ`BBI6*kg$a>fb5^|ND;<7wLAE
zxW5Nct?UPFv0yBvNUSAAo9G8tx%x&NfAP>TO+w$L;)n0P-gcHueW$b_n3Nw*ku+Gs
z^k(eACi3+kYxSQ^@~H&2e6Tro<IH&~bK#On2+Fq=`;RFR-KtxMYk-E9N~fBVS<jmT
z9If&soi~+oZAEAsGGDE!{;{@{P~~4_>+P9%YWKI<=NZHOKP~WONjGe8k5kl*&sy%!
z>L)KkFhv*uH64WD#B#J3to3ll3LrX}JGAN7a@P&0n&hmMZ~Sp7uP?0KRvvbnA!N>M
zFoZLd&_Kb|<a)3jn_SIVR<?^Jvvs3G;imd|POtA_L)4mT=?C*LbByDpl<%f}4#D3K
z-Bvq4e$&F}L_d*U;|Up6&RgJcf!+5LgX>0aldGh)KMmke*Ff3Zt#nZ)#2fD9=lnt6
zo{86)Hhlx!4;C-R(1Lur86uObt4lnlUei=i)7qv<x=wY@4x&eypN!5M+tD!unKz1N
z)4g<o{KXU)itT?#Mkf|z?)N+fkD9`OyJzc#CJv_o!4hW<qT}m2ky&B~Xp_KHg33Vb
z3-<tWNBo;@=ZSY#{RVdAA{w-V(jBj#U>?bF5j#O^<QR!M^hx)7GR0oN;%yfCWF1SV
zVaTHkGzV;yh_;RgeJ4?T?)~VvE}PQeI#LE+a{Ibidx_1WC&+20YQx-zAJ0Xos+lfU
z2Sw9|9ubW?qM$$RZ9nGIPrHFYPmSdj&4Q>0id(o=r_+6MfR2-&1@bX`Qo0*W?c;r-
z=j!m<nIid?Rpg>l!zk*(?9xzTx37yn8#&&uW#(T}BH>wVRKEX;%6Xxi*ymd*^mv=b
zZLEAF$m_~QD^HED2AD%rXrFG^<vo8-lrh^EK6Zh13&Dz|Ch+p0ln~KkN9pc2%?`SA
z>>98aX!|8Jz)?7ORoPxBLy_>5#IZJUL8M}-Ep{(-tf3!0bN!A=Vyn}znzJ1E=rU`P
zzKb4oZ+o1CEmwiD)NSoGkpKl}#nC}8@Y<N!q7JjnXHMKANB*2C6VOgENi~;Lm@2NI
z&u)(RG48hr^DAk(A%OQOY;2C%r1kYPQLa1^`BLh~Eu|2m_JhsH+kNuM%y*A}s9$1T
zqazsie6gC@Ie$p<P6dC2oRcl0GabP#`Fz#+3rEC!nXi<cv80&Ww3~}cski)<sAWFV
z%?1zRAoJtiNGPmoTb@1>)<IqOqZ4XAsfJsySIxECpTIu;j$;{CtLKE#gMc?@5x64b
zeed-KY_J!_(y7d|mzq}{3-HkGjp91X@gYhP&5uYei|w6IV8igaX;0Yf-$gvQjBcFl
zOG|^1jQ?Qw-hEwu6i6GP!!R$+G`GE;TEipXOM-=s?=_6rvW;vH(J)R7J1H3ovQHZA
z$O~!1&P7n?p?|aU&pO6TYm*n!pPOJ;Il}8+vxV)&QPxjXY73Fwo$VOF<ImWlATARx
zUJoPHI2!4-FX05=dX`f8mT;qLF=;KmMxyxEBFV$GJAS{9Ym-ZA69G*OBjRhS92`b-
zZQnwnXRX9^Gy9Rq$rQTRQ)E7iN1VV948q@Z!S2KPmm`hF#6wnJ!B-2XL3taEq)Gdn
z7>_HR*Fwg9qGYuPyCTuw{RK&6AgHY&1iNIX_<dCtU|3l>ulcRc>n3yb99<Hv!&1Q}
z&A%Bs;a6ffeHQz&4$HAav0qxH8jKG#DK!)aW|~laImEe*t<-(KylL+~`fdLFJu=@-
z*TIky@d-_2`uwZdmT4rtxlpm|kMQc=^bn8^xM=y~rg&qC+OOJ=pBpc{_E?R{{RvTW
zV}lW#$8j3l!Kc44lH~Ysm>>wpJ1c@!Ya1#QKfE}`-n5%kjJ2OIEZzt*xgwNGVV*J$
zDg9{1wUtJDF&-T{k!<ZGydRVu1EH@NUmasL<JNG|Q<rtRIPxnU0<s<?>eF2L9FEge
zPUlm`g=gvMd%?}Wbg;n=zaE+kOQ_Ey2iM@wI)N^s9lQES8$jQ?c~Y@*6CLo|9)BLM
z+uuvKx7j+SvBT_20MXop#r6W<C%&TT5uyDF<o=BZ_J9YOpg>9z9}$t_)PI7vjo>6-
zFDLZ=Jpkk0j)~ELa%V=LHM9L0;QK9mWM(hq|DKWI7TC>bTL4jt4e-eNH@it>+M3aD
zvb%_o`qBvR(qZ;t=eOIfI^ULRt4bAHSu-Fb&*{2M37HV42nQJV@~3*z`UrA!l_myU
zkLKAEYlDoW0aJ_EQb!-*`VJfR=62|~%YHL}NU3##xW_b5>h522f%F8-p5lXRQ=_)c
zOymLuEHiqA&a<iBt6k`d%_ucBcmC-d@)o>yaAs3~RlwrU6T1Ma4%@|J%I}hPe)wGl
zGVK;p%lj(GX4f~}e+#dv4}SxUfxK+kNpcYlLflL!UM-dxT|CeaRc)v-(9zU*BEOQy
zAWlLoUWkMCQ4388n_;Fef8rdgbIezJ+#JBJp?(W4VujJQagAwz=U3J?*8UERtWAvl
z{WC*`087}r-Gz1CtBrI193GKyXF^&V#pveEjU78*<6VnidNeR>%zOFY5`i?D!WgUt
z=J{8vcb?u8uI41_UvHmXu?Tt0%?=sMSKmg6yT~YTKNqQ5&MWxz_CR}X=@$_+7CgK;
zP=$=2Tn8DytS8a=q{!GAY%47-blz8rAJH52<T!6(!QuvbLKeg~0_*f!6Kxur*#S<T
zO?$4I>aZdHHP>)@z4QV-DzZ&_J~s-OKTLB!)V@aNQHI?f)>$l2slA+DK5R(>Fgn-O
zUc0&Vg1WsL8EFfp-?Eogfc!?zZRPxGE7}mFBGATWLAW!lCGf%YZ=Qut-3HP+JSXES
z-RGqP_c--9Ctf2M+G;}vytdb$%-S+@{PTJqVtUbM0`@Ad<fPoaOF-b`NWR_q)GM)h
zMz=0O^@ViygQF1kM<;22<kUchoGg~K(K|!GbXajV`SJME)aZ82>^25_d)tQ_<N9?v
z2eNV3nb@N>r20^G)lxv%8;(SOUBA3v2TqpTCtsuaq2}MV%_AB3y3&3CE#LCLe7^Pm
zmitK`f3by5pQR3IeI)<$QL=PP?b?bm4c2C1KG{x2vMutdiH+HbnJFg61tTT2=<{Tv
zzi452kNH9uu7VY-F)HQ5rUGYR*J9@*-{|a|oy-2BhzDOzqh17Spx!{x>5T~)`cQ7L
zUuJkcp*kl4;urE8vr1y}=z`M<bob0>dkVPEItca}7G420d<tKwg`avPP7Y(Yac4-=
zsB^u$Ig=Ai_c3ixY49IR9me*{rJO!0!7Fd3t9eK2B3dF+hTDy)4|Fc<Eh0D*b6aAB
zG}pgVfI$E4ecD8>s-!_d&9lu}v*<qnsi0{<In3RLPkK0eDJnEK^H&4Gcp`I11soYW
zs1(Et6B^L%r(2yHyRZIOfKXM3GXaSR3GZx>UmuTP1xZw5>fkzvv;2q5=5j>3PeMl~
zB*XEJFKZPnbhJG&>eWeKxw>lXBWAdoL37oT2;;g5>w2;UkgJZH3$6=Tgyys*Xn)pW
zlY{Q3-vS#}?B4!sHOiQTPzsjR7l-%)JNJ<Yd$%d^6%;Y$M!?>NeK&)CsR2pO>ioFN
zS<f%6AV?$pq2ShSXJBG4#C8SXv@rCO@7baT@chPIY2N+;@8Y`_3%TAGybept?H4`{
zcnv6Ov|M?lylc&G=acq+A?7+>IBxEAm}$G)ze@-Mru1l0w5(e|ME|7xCK!CRae<H(
z;m!)(1^ZW3ZK%Jmhl8@{$`htm91-a%aE{=T5)$AFcxR^ajt3$oohh|2JFNJ|zH56Z
zea{6Em|gQ0W^QhE+Sh?t+|}4nQLH9`tTxBzspI(T+t<g*bm{ndUe-19Un?7@qJK=z
z#6X3ZbC(xhvlt>6*9*3f+_UL2;$1@C9o|K5+qd84!PJbmRN3N8b!qwCTeE&MHK3NK
z)?~SokS{!uYn7sc8l|6Q>Z5AkQ_cKwn{El(;PKQRZ@Q%fvXwP=dbusN+VS~L*aJy?
z#o4J?yGuHZ(E1z^m)gb|r;|mx6-}+LAoJxExaP(WwP!cChQa}fvGtBulG&PaQ^+O^
zJplh}0e<9mt6wSg_q{`v$n1?07K^mrftdzBxTl=BACdgD2+%Ej&z6k44q-#X`X18#
zTMf?3_H{p5FBbnsgakRugIs}W%0f{8<Z4q%sN)VMKF>6OK^Zn+bH~5$Y*3rgAL^Ob
z@*=1GCe~9$%Brg~EEO85qUYBP{(W^c2RfEzu@u~l_}LOKI!Nj2uqjA6LG7y7gNc*6
z>J-sK0+!o{(;`_3sMjT4tWWJvfiP2KIhh$z6cqG18{?ZNA!(M9Q&HX0k8dl)oc4<`
zA;9&zh!Lh8foj&T>l0&~6%|^HM+R<AQfcvL|Lg!2<+mkfn?xE-td)pgaNpmzu$)aN
z?i~)=Mubf&UEKLylEm(QCyE!MUrI(WI={Ut<pGz&`(O&2_-4PUphBX(RQltP<z<l^
zoYoF~20|qoEr#15N<fxxV~j#(JAr4Kp2V~JTHmjpkOh@CHzm9!GlfX`ynQg~uA=`q
zr!M~{qTg2WyJ|P+V0z@Z4RNyJ+En}I*;)-`k)f9LHK*TIUOmV1lXa}AC~#rt_wS#j
z2$J^m^Z=1A6=`R!&mq7hyWe$76~RDF#r2UfJv-IvWp?siH)R1Pb1z^DG23;$kO)6e
zi*3TD>3S6jzgX5MT1icv8d2Q9$R@prRZ@)3sJ+gKJt=Nq3jMb=B@-IbSz5|BTH=gV
zY}>{o0$jiR&S{#130cI7zG}w6;9OCWm(fMP=+InF^vh;a0pYXflf0F=dl3)7OUDZC
zOh{#z*9n>*W#ds5QLzT_n7~)#3YC1is)%;so?cD$Q4wlVK1@wT8g0&NxrR*eT7o{I
z>s!R<h><bg&li1N!7~w_uuq*5@cem_8!8YO$Kd8Q{uAsF>`X`Qw0$p-tPR*+n=Og$
z0Ie6w^4`%<)i$XO9FpN>4%3?^*mba47#KG<(9gMejU1HSh%}`vDW2TD?{?vQEq%Me
zz^P+NzVH_^#E1N;Hf=e`W8C<#u4oH(-S6XYOVP%XhNIJVI8To2Ud}F5dXul1RzsQ?
z#{s1Eu3c+hs#Use%d0j(GNaF0Itg*cp+#_@1_SbP%Y;4QjWl^id?#Wq>ECHbe4XkC
znU`;Zn(G6)-=os{{M<NVU@f4<UF~F0>-xsXX@Wi*Uej#_5bW;1VxIwmY@J=+@01b=
zC*MFeJ$H`v;TjKqZ4L&K^V5T*ji|Y3KSogF`gorYsu&p0thWiexl}jCyCy`OHrgJ3
zuXOB|EqxyX@PzrIvcCZQ0@?ycTm40j3QUQAvK%|=*m~ULbX`#|%#1i7c1qX9?`(z^
zD&E75!!u__4(CpP+*RNc&qS0DykSUr^DIsF=Y~uGU1hspyT%#y)avB;RG&lE^$IdI
zwB=MH!;$;0p{(`*A&OcWfTKey4I3Xi6>S0K7A)Fg7w|k#s6P^^b5J~P@Hp&uwIzV^
zb!<Q|tLW-3axY(VmTiUCvwDlIN-YOI8A?>88Lw{+Rm!ElCx(u*3Bs2P3_Z*htYlQI
zvJ4_rV<Vg_&4U-Hl;h_n@}?6?JKA`pnKCB#?%Tvtsed-$t;mgXmf7#}yM7480(h}W
z#};>LPGPv>kMdtJW<34AvUiwfq-#Q*lV<;h6zAS|j<Mi*Wkdenp~#Z4vAfNIE=pIo
zbSdgF|8eKJF}@B!<l8lovC)@=bVRCo>NikD>|n@7c)%CBu@6y8`(ECRHej`sRO5z|
z%k{(Swd*1DykZtKWKZ}|CN0A+kQa_ZqcE9l4HMWWW;%BnAbOSEwMg)Fr_w~GDoPdc
z`o@<`?ztf+GC9hT9>(1Vo{~G4EkE4=KEz#6lpL)iIyCNEzJ)%*-As3LYu=63xs`7J
zme4>R>&;sp`+@z=rsSg4Pu$a)Lz+7F!Bd$_<g;{C43>uE<8_3<S8av9ylzPk9c`zk
z$>qA5SwPC!e>vJYukK>^3Llg&amss<4NP60t*}G^!W6fuGQ6LQy!xgjD#*TP9J*j?
z<16&;_;X5!qq@QsdV{Cr#7gmLZf=<TaQQa~i?oF=_Q1kXxU?OG-XJ{}D4a3ESaTO)
zN<#yiQTGE&2FjeY_q1|xB+$o!C4IK)&z*LL1(|)f)iK$zneUEb10J2n8Ca)~!pore
zmb70N`}*Cj63gIsZXfi4UJ(T*PZVO^OjPEPKWN)0KGP$#4@c50`RMGED;*qj5!|B%
zI}vwly;rvau+N=rdQ-9tN}cP{edp|TS<wo_j43I8UdV%l${@vJC~T<z(ZYi3D~%rt
z#6*HR9awtu07OxiQ*_^+`B${j^gW9j!l!N*Z7#uC*=$<Td=V=D7Ga@(x11`f$(DhX
zp^sL;8n>Aim$7pg>GLF7V4Tc3-E&wxHvqi{qn{-7*y+=s`9dYU{GFNSs^o$pWw9nA
zq^-TmJ@_~73?xfk;6NomC0e=IdhTEtirxcmYRK<tkW(AtJEfiG%La@vg7=w(>7A{0
z;;zItX2C4e9T9Ks`?p0bc-KI`(u~JcnU6{Ey`6}f{&X=E_~A8sxtUQt1-}e}iMV<2
z4_vJNB)Jth`vw3RXqSprvHYoV$ojDV)mFMxX$X^E3Tr*{lq<^gv90GS2!_g9AJUl2
zM7}NBz@h=W8d13OQq#8kwmYr2t&Pr-tZas@#M(}J_Pw)6lMR^gO0ovN$joJrpWJm-
z2@azMzYK<nXp*+_^EKP74oFbGtS@6BEzQ&dViqi~&41|kYUDSSvJLDmuWbAom#6gH
z$Og{GM54I~nVj(R7kg;h@8~(Um!C9G)~iGue>37(ZpGV2I94N4_k#u+(Jp32i}61J
zx@FeB`KzA@RyHgCCb?_a&bPu#@tpeCNB$n<lUopd-w?MA6C%IF`V1NR{t!)Yyk+wY
zZ_`ondJT!k$7lQ@>OOiEB={jt*<1RJXLcI`dQ^&-oC}z{R*L0^g*u*J2Ev*<VIkI7
zE1Ms57gTsyzJ5{nao;vsDTr5<)(<dXcrwmzvogBzc5c+?ec)p17EG3bF1dJl+8$o8
zvs;jVn@aOD@k7DcW_7Rp8|3}pjL4#n)kKJr)pG`6767lG=8~@s02hK6h6NF6smQPE
z<5SUpS$`(w|CuQ+@87N{s}8^G_qt+W`|!F1(c!;6C(GAcE(B8A=ME1Js1q$>Ql@-f
zn#eKQ02C<h2d&S=Ooq-ul?kG1d!<@_G#4=inwrZ+;0BrG=vk<I_1{PJ0{<oH|FlwN
zk$ZVuea**{;w~scKR0Ztm{Y~znXHJFdMM)M>ja*;KO7xYEQ$v4Fvk7paKAWS<(=ez
z565>UVnD1d&Z@jgDfr$b8L8!_ztN4vSlRpqTJ659C&b^^Ambzm=*Gn6+qXFBzPnGU
z?n0Mc^iElN=-N*G?M!d}%ja+H<o$Wl_h-2xcUvNV8~$jXJ?DX-*#?_K^I+8boP$dr
zsRxkkTw{IF8_Oasz||tdBGlnu()U@)3X{reM?|(yXC6~=kyRnM#X377BuO^UDK;%_
zdgufusWI8*>`vIwoh`jut$jl;$j&aR495r*{xP)srPKAs(_L_|W(w4)%E8{66z6rQ
z_&DA`S1QZ#&&B0)rxYMZ?Otq!;AltS?cd_a_Jq9fXGnq0>>((uvt<o`l`ERYM*<v5
zcCKmCD&@Nop;u0G;EilEGh6^qg(i-QyknrW_m2D!1<7=nI5ozER2L7ybR-$L6U-eY
zA64%Yn4Y?}9QSE7%1CBEq`iuhh2Q*>{rr%37#e!odp%_JJbn8kbGf$4zvm<=0*K#L
zh{yHzHVNM)LmJ=IEwQ6px~|(D*EgO9w9k$PHx^<^FJnzxrJT;5Bj_hl)cWPUpbfWo
zB5bjJ2a8^+>{}=r3KM;8SL9zi8?aPo!=~3%%Ula{)Tq!k!ph)y7CW_DFzgk3=1$^z
z>kqw@=IKW!?V}PGCQS##!qlz}OCmvno-T;}?Q1Uas)|o&c(f~8rJX~7C9;G6x$*;=
zj-q;bdSYiey|eMJC+Dyou}re+C?DOK1@L|Q{BEdP*aME5IJ^h0xA?N~KkEGV5y0Cc
z9=3+DqM5>-d+%aOm9b(6==p*_mN|o)4Ejfwl5aEZ>0Dd=%_f}~J6orwux*zOlE-;U
zqlW)V(9wh62&eU`bUkdfzfHFa{an>vIq)|ZwkwB_Xj!86EzF%hgbSd8%l*dXWh-g{
zS3~lTvndveU3CugWnP3E$T+vnA8jB6X+t+=X9MG0eNMMYULmI)iy=BX!}!l@ogTM*
z>V4Ijg3+Ib#n7UngOnG{!w1@eeGyjADLH4IHmU+7e4`GJcg-*-YZnx&A>D3(vxPWD
zhp6xFmd&O;2j7*}vpWCn&U0osNuas~TqkI?L>eGoa`OVGX4&{@rUozh9iB9F57cDA
z(b}~<*d}T-BWs<GK#Feak}Od_r!B)(>|#4Bs$(cgh)%ULyert6`T6&p1W2e8Qm)+_
z-oBFd^e&8nV+voYvAeLtVyENxBCjcQYC2`EFGE~qi=5QwEToWlp>pLMsol2`UV>;Y
zD6pvZhgd+D`Bybntdbypoz}K?#1LQ+Gyiv0H`UkIjNx*k6Ptk>Mdkf7CnpqMSDta4
z^WoxV1`EnPjzKvRK}9-j2}9o~1AKN?vc~*NKfl%~^P1AHeajOr`0}#-zNJH)U9e|<
zVollRFb)ebqXS1nsN{ghf#0bH@uONt2WS<&in6`^COF^^ty>W_Oq!1G;6{LL^1<VR
z<e7IxSzNP&4OR)OD+?`S##*adikkWAm^uo!D`nID$3fi1QRyp}0?CR_{vKI@1m%Fq
zo|*Yrp0fL5-XmVi3e?a3txD7XHTPX%O>N!Up=}fpwo(+uL@6R21*H>d0@9>~7C?$r
z=}lSyQ9uzwZvvr6ZwjH8sHg}Cgf0n1dX01hLXv;6-QV+_|N7jVTQcVwWsUXDG2bzm
zbCf{-FjJBGZHS@wlLsFvMn|_t!~hbK&da%f{WoW@#QY<@67p4rug?!BvOYQz-CpG4
zW>T<3aQqeTa*2SWtQenk&m@ETAIVK~y#0c!KR?9dJ7&)8l9z7QR0?jjZx%7|F+i&o
z%dX{z4h=ii))=t%E<nYy#KRtKF<P?&ce?gecoZ#zG0ENn*9)7>h3C8mlhunK7L*{<
z+x<Ld{CQ~bENdHX0IU^{G2lErkG}BL`&}q_q1R|6EcA0bXQQfnCX!@=MW~q9F8&z%
zW@<Pq(GjNRu}HY;7GL%B%iz!mwv)C`&DMSUS4Du`(C!c<9j0Xuf2AyTD>DfuxVk+>
z>Y^pm1<4-COngXJbz?<CR^mB^Q_sJ^B08U%!ly|jNGg+-LVH$O=3(6x2PRADrZ@S(
zF^Wv>+>rEfuGUoa`dWJz`<;3NnV_IjJ=r)rWzq4`o+nNwn`ftJaB`+zGAa4=j7+O)
z_!@gno*vtTs{@xC{&FK-60nTGS)U<DCHO`2j7MAEz2LcySsePlkfGQ?@E6XpceAx$
zp`woL7AVvBCIakcyu}5pb#)n=*@BX8a{i_B8*laoV2R@x_ph8^aRofIG^d6isHIhl
zInfrdV{Y?n^ac<|d~V@#i*6a|84Ye`WX^}71ksMkYm3rSXm%W|WNv>tt_P`o^r$pn
zot`P&xGc##R`jbQxaoX4{Gsx2<wGA+@O!xPjXC;expRzr;7dP8rw6}cdFc9j85^wQ
zjBhdg<+>61JACO(m#3&c<}nR=1~fh6bI>}h&pn-7hSAR7^Q&!d^Y2Jtany0MaEg)%
zk=Qm+UYe?W2Eg^o7tQmueYLjDlii=5*h*29EZbU|WN~JfFo%8-V<qQtEI2_Y4At9<
zX>pVgM}(S)iH|#b<}=_l`dbpjzb*&C^ImE!zIt7-gA$u;$;>i*2oovd>}xy`e8sDy
z7NRHDYeN6xV10$z8Wa(BAWslW;(+wLIH1v~FxIUJBJHE{NI_`Zfo2pyEla0lJDhO2
zReP9zyKd831=G!c>(f)-Kcy$<$@9zqzG5`5bk=65(`gnYD~;c0*}8OPZh76_btUmx
z+N=z&rZmCCN{p{bBsl#p_KeY5U$jWH?8*KX0~VnmSXp}3>b;q;$+v|b6h|Ctisc5W
z%FH-NQt%E@8NLKiI>-k?ba3W!0I}F432q+j3#;;HjG-eC5ve^5C9NGU0{u^GCF1`2
zJderP)z>SKpP`6R;U99X^*th0@(4K@Zc!dlqL*3R^FqZrMNyNX+eNjGM1R)Squ(rB
zJGFIxZ{ZHjsy^<9AJBwiwTSeXT4~Wcarm);*$T3r?v6b{Ho0h`V2xYbCu+YzQ!p6`
zk$uz~H5vuhnAa~W=c`>oXN$#K3*D>{s9+aLMIWELY~rOcG(Bm5PaT;a?7tK69ykbS
zEa&Hu5-~z?rHDjU;lJ^P&qlmgzl>$Ax6wE6%c!sA9S1Cr`IQV+S^td~JByCHd(xMc
zy!<S^-3ObA&h#F<Y(}G7GHWza5|zg>i~X=-D$#>T*<fA5lo_^RJXFap;$U6$u7q{L
z!`s5^>-Khzf64Wqu(qQyyD<V@XS>;u>RLq^aQ79wld0god%fzFYDJXZ_7PfD)@JWr
zCp6S~Ww0%w^3%?nRT*NnLX7S}s$dE?n10`i+hnIX1y$IW&~L?~a?;Ww1EdOh#(=bb
z*Y4u)C$`$fleJHdpDA6B0_)ZlcOBRX%el-wpAJ3##?GKJqupsx1_e1KCnFS5VM_~T
zy%$9vRhFwYLT@WdocMb`fgxO!%Cd`_-v(iH-otG^FY6H4u{DK`d98)fIYmHnWPaTV
z^Gi7`40X`N)Gi*-rs(`t%krE;wu26eklqWOe1{;nY1(Gyd+>?>ZKv2^Gp}dSLFVA;
z<`_rN^|7TGLs~c?2qA%zwsIKvxZ<WJmlTJmWz~27-y4#*e~kBiEzqeRkGMfi7{7|P
z4mftB)FQ{fXG&QL7oQ82h_JcDX=v?!g24ZVqTe1=7zHjGOBwid7R>8@Eq*B7z3AkR
zbu?2{Ynd?L+gy<Wwl<IS993A@&}XrTDJ7yGJ%K)Gq!w3WpJA~YcyaDF@U>`t6Xn%R
z8pt1^pGg&fS8A)YX9uGLuHgFyx~cmyOj*o;7XM6+0N1A+q>pRpPU^(J?tR_YGay<u
zSFpj_^vve1_vxw)?B7*r+G*fdwQ1iqf}LFchsf`EYyM<Mc9SP(TC)>OOwEn?4R2?$
zij_S)5_$gY5oFnfKI$)YUEOVt)2xQ@>|mW4S0m>H#TG72AYLjbW?5fG@E?69EPT%9
z21gf2E?d}M`r4z)w(@HPIIJ@zm+D~Y29PI!uw)A8KNH8upUSi3IFEp`gx-vfNLe-O
zb~BrjM(P9Vt~Nqd@NvVH$!;I}1XTX!T@V@c$J<y$o_b#i>8#9)Xpb9KdVwk1P3TJ9
zPleJe&cY@`3{h7hI2+%0Vp+ceJl_v;#c+l@(4LCJ{K_NULo!VwQp=lKevM3xOr>uw
zrkEIyJ#d1l{gjR^dk&AUs5hHob{G4-Hn-{0TljObCol8AX7Z@iWAX}akO${k0}Pp`
zV^ln7ZxU!DggDc*5_HFkujaT}|C$Q)6%T1(F$eu4t|jt-M#1IyphQ7bAq-0T*7v*>
z36@|ITJSPWeZXy!_3|(RtV=^JPf!2U&$nJ;?<!4rbsh<r1$dTaue2Li^EPt}-<=NZ
zxgN9gE?z7X=jz+zOjFyW*5xRkZzZxYY^}{LIeAc(98ile{Wk<wTS*2*wRx4t(RETa
zH=u$yYtnjB1^3BEE4VXuRO9%MLH9TNJ0nBfw4Y1sUOMgv{S3T!8g@W1Lsk-Vae8&y
z?wX@zt!2@L^B#|v^l3E{X@d-EnS|ziNZD>cdZoPz#!gIq#0G#sVhsOv?&WsUsO<!y
zvJIfw^u827W>4K|hKp(nUb?S2Y~>FFc%$}Eyw2)`=C|d`$Hg>Q=|9F+@1r5=l=ITX
zpG^C<&Tw_1OJK%jKW6AFJH96`mLF>=&YR3;Y2{m#H)pJEdc6CUe)0xoz@oS7qi1<f
z?kRh3dG2=P8WBUL$6BB5yWneq#;^{9(;8;0+{M2+!ZQ``*b;rT1odY06avH)u9v)f
zR~z4ZeXz;^i#AgF?Y=2n{e}`A54K{4QHvTu(1L%yi17zFWCfHSCcySUQkFF_=CN~T
zwu6yy@oH&!mz^0$`_qSiFEa4;mSPQ3n8E3hX?ExOJE~ub7C*er6!|F@!L|IibaXH9
zZ^R;jYnQ!Vf7Z-2(3FsDfnt$~Y#)nmqEE!S-46}d)>8}31S>kIeQkW>xL(aY<RuX|
zO;`*x3nR9r;RN-GeQ3qQg7v~?w&>1$N#wIv#hlGavkA`=21?8>lQ>DzEjlaw&0Nea
zpv%qAV0y843O@~cGe;Y(HYM$RvT7sUxe(}Uevm$EvfJzWz~wUoolYK6^@W$_<k#;_
zH7NY&T&nB2x?iaghLd1W7T2Eus!uxDl^#hA))Wex-gJ8GwXhp-kc(9GU-WnJSJb?~
ze3xTa{oJLAR#$y9zr#`dvMqRlBVC_uaehwR%@aARK%?JYk9p2}Cd;N;L4z5?aV0(1
z(t>A+;c2cYvx>58{3@%h1Fve`_hVO8<9MCGCW4W@tpmbR=jz9K+NVu>j60C&_KN10
zTbcXn+xN|MzxUuTdd(~0I(KSaUaPdo{McjJLCLhCUdABrD<|3qxqn=<X12G7lIN<-
zR9gI!gmU3!8u#3Ap+Q}VyNo6Arm}%CruPePZODbq_gb@@gZv<kZb$0D+kKXnNOP}T
z_CR#UEd$``#(<Ioq?S-47qTe`T2$MT);S;Lzm+Xf;K5(x|0MY{r5SI{lQ<o=cev5M
z7pd<>$37~L{`3rnkqoHFDSKa2*Y~bk(XfTyG~>*hpD@s)d;f+}5DvnQYH=r?KX3gv
z?&JFBeK{-9gJN!_Kx4P1dE4{IM&F%x!-KyK!HdLZImV|(W{=$*lx}5lQfh6fT6kZH
zRv*14t)yB&ueQb%_vf@v!7D`sUHG;+kq-o2qYfBCe1^n*?9Q9Z+%s|H&Fk;t4QV0V
z%-W1T4wN;uFdBdGtN8IEhcVMC$2*sPTG8lZSGeW{<ja9W->yWQtmf}{MX;{FpTPv-
z?f2}t+QfG1nGEsIKQ!+BRD~f1TL{sHKHxAi4o&s(>SIhJ#Bx=Ne!xTF{L%6Ae8#Td
z7qy(3)uJq4Pd6>A&WLu{Hs6-OVDS^+sUMR`JK3KM#2kq)Nh(Nqi?M+vVSeL7Q7KoS
zw}%O_ewV+#u5omlqdh48U^xYLzi1)5?Tz=6d%3nIV;~oG7oRsDKzGPaFt4mkV+z%w
zVS1rRu5f_n`Ul3%^@xS@hU1&^Fhl<O+Y*SthXYnrgl2eooB7H5HsdW#&Kum<QN4kF
zlX?AC1dXJdRi)Eh{=eLDotd4)4$@A7tbopW`=QQBDa>@ltHb^geFeEA(>GZo5BSWE
z_)5o+T0M?&8*+}SVSCiPXKPE_yeEFQ#`cs?=O{AxjZdmxOS@G{f=1<NA&j<;lAI$|
z<VK||80oYHO+uQb<bkW`jtc3&UmC*?n)9aFl*!Nuh2cyL%!sp<UOYkm`JAZ@B<JQA
zD6w1|W%VGe!ktpY#~LO1gknO}+YRKp;ximj9PeA-xa9q9s2q34f;km7npe-s;z;~5
zA1#eH&kOW%eoUy{^dHyw43u>_XT&tPhl%q$C|nrN;^3f#W<cFevbPp516`1|84Tb6
zd8h&&AazNyO#%D9d$o;2zcZ+uVE+QIaedfSe4&$*2S(o8vk|mEzF+y^8_$7d(|+g=
zGatLzuY3|A@T-^hLx3L73wS+C;<l&iJ5-k#<a>2>p$zZ#>Vchl&1dbW0`^Chx8ScL
zVbq@4KYe1f0+MHrgB<1PD>kQdV~Y>D7-qbV*O2PmH_ojzI81AlG1-|i@rzT@%LR%2
zinZ*bk;jpCW-H5aC3wA`#(r~1gbZ79Mu-2$l)+Hy`ZrTv0wJV>`L61zK%btiRh1yY
zRSMAfK-85n&#bbLtWR!?H7FySa!sx2HGrYsi!gP)WuUU}-Pi~d7ml}Cfr(LOv8$y&
zI8}l$ih-pLbxI^fQA!yJib9&54D0ipx31H?(ZUWlagEjgNZ=1(*?QM_?Gb$__QpTS
z*tlhd3#9~)+-jcG(X!t5d^e1c8kjebnjao_KUY^TF4y9$X||Z0c;?8+bfKz-CELHg
zUa@66PSB6oeHL<3gRWiboLD<*G$Jn~Idxc>S^WN*jwy10PVdZq!ZKSK`z6J-BCTKP
zWJnZ*ZT5HEeqj)-$*fjnzw9pO_vO23Qy9TwEpGktr2b7bI%y(kXD)YFs#05(m3}Xz
zEVd@0t4{pRU5>;IhU9MJ2lq*#lrT5SR|S6R(8)28poyOEs;wWdnIjgwzaJCS)%Tgr
z740NZU8C-Zm|sP4t)EydTV!~7A02(a#jR*PGjmKSIap4G`B@!QWfzBXRYuBZJMI&P
zI=kv;(=tG4rGG&UhZ~sp#5@g+DyhryXH+EY)MKvRCqPdo)>AP9U+CmV$Qd5Y*<L>H
z6i+x^psz#+H|vpV)E|iwU3;r@7MK@THUD<%h#vzN%nhgMUY2$Jti_jFkJ^+)@bhU2
zr@l8hGMvAMd7)yKbt!cJKsYQQ{kZB({llc)SG{5KA)<oU&ygzQn#)S++L3iCNRS6q
zir>hI(0~{iB#T;AqeO!yzD~ZjL7|}=bA1E;SFcPIt_Mq~GY0Biu5qXQnvdqTi+b*F
zPVQM*8biEDn)W(!H>)~KRkX~m>f0w}Q{}0J|C`%VAcRc{&fL`}Io0Qe?~ZTP%v(r3
zrWSUxJr;TEEGJNx&yw=KXXZy(mP1ZyYoiq3B9RrLogZA6OX18Vu_khMi9c%|8;9*!
z^K_)9cr7Bl&Ha^>%;;1kM41~%E&drYmB{9o=UcRS8p2eO)x8$FMB}W`J5-=^e<JTH
znu0uF2_dmc63CCnevp;OwNbld^KEqt=7Jv>sL#ei#FSQriz5C!m>pH?PhJ3S?TdMg
znGsvF+`YwS4Qt0MkX|R0jdacEZ&pUl+rB&o(H?zy>}UthM!rn$Ssaeuvy%3|X6~Ue
zAOxqpB@G|bQ*K@0x$Z)FQ8k-bRZjwiWB!u>V;V@3YJb;+)N=MU&t%>WdO&f%y_l?3
zuz1AB&Up94$e4e+-oLD|P9Y102w3?(y(@Kp|EyKxqmZh-Za#f4*18?$`;$K(judQ1
ziuAp{0$xy;n6JV6kX!0AySH*9in*`1Tp4r$p<=@z9NWF`V?QEtQEjqczE0A{*^v6}
zG+B1X#Ij^>E_4GIdy1Y3TreT_Z%7CGbw722g52o-ll+bn)O!V9O?q$44XBIlG45Gl
zoU#%V_)Ts*4T=4nmD8R1(InJXQA2c$Eiucy>?8Ae;th-947<An@1ghV#zFPno1HiN
zxxt+NNBtLrKj9}PHi-6R%9>JbZ*C@i6ROTQ7;L1i#<h6AuX$6bT_=;Dpm#MU=IF`t
z`$teDu&&`vGhpK{2gQ(W$xTj^9~w)}l~x-<-mqBi97m2=BdxUoKZ^%}(2(%E|E^|v
zBupcUw{|CX=5Xe&Bql8I(eLx^&)RYbuY_Exp&Ap2wcBgkuE|llzs7U)%g*d81*UFN
z-u>3~hP}fqSK5f}Gi}|&>}nf*A~Q<;DuM>N_G|^(9lt6Ho9?R7<+j{){V^Kc4Rk&B
zgD1pF+eOTC6O21-mfycQ>umVf{!`Z;4N0bjM%IgssFpbw$2(yg*VJ8o-Wzee$@ZIt
z!PTlvyTyIf_kev2_WqwOg5|n1C9Wyu4hB5u(`?-?qZM4*al8mLX`$=H8E5ujod=MZ
z81hVq;V<U9`@Qwit0V`?YFm=G9TjSzkWJE1D0N#pdxD3k3&ysS_n<4laaL(nGvKro
z%kUl--i7=ZcWhr8<#SC)c&;wktf=RD!J*mT1aBU@P|7~52i~C0*Z81zF;O7pj;&Wh
zO_zn?ed!<OwM?-+Io5FU_E=|`%n^`BdyPi_D&h2pqaKyPo5kjileSf?-$*{hlW8sj
z*I5U)B?q()q-m?KOYIY&p4VOG4|r69=@8<<@X-0ecjKGV-Z!~}G=7UhzF&6a1=t<z
z<i87nkTbO0Qu%&=f1OD)m-k@PN8P)yeRkE}fb3>lhbL|Bv(H;*mgh>JrQ5Q8;`W%Z
zcoTZO_BpCIR<N9B^K+Nu5^U_%i;(J)<WS59<{bGPUhm@BbLtqg{I`1WShee7Ri$6O
zCO_f|^e)ciQYo%=GLZT#RrTM(i<_SE&+59sywvU#8{|8O#-PUY5<uA>*7cSPtr~BN
zDj*hh9T4x8%g~LtAzT;4E29VEhZ_g)#IIkxT^x|0t(43QVBFlak5rrG#!}p6CmX>b
zcbh!R4&p{Vn5;M)b)dYif=et+ebc_|0gu<&SlYbf_fSj`Rk3M!e2+-=R4L32+BK0r
zEcyXE?-H*Jd!j3J)3`BgwO1l2;P#$p)}4vS*vh8bFO87l&!!&?A*H(9cJ;F!O<W?S
z!N)33M@$r1>@uKY{AZta@!h;Ioc;HuLQd^`Ba*XTsEBgtjeAKd3eCJN3@<JiV!uOY
zptKRY(s~F6rDl_%jHQ!cJrbK*Qi!N-(po#+r9O`|A!o((Ij>vI)D<&_kSPW#Jp~Q4
z%aG@=GQrzdV0E}cm>T?LUf!-b_mArC(B1@xyU&Xlk0bM=$J#vOjy^&lJr7a8lF5ES
zH4_SWp#`19+_W?N0<vP7MG12lh*&*cBScaKivU?+Q9u!S2dQ*OQNH>1nZLa~Oz882
zgJm-Fu56Z8%0?^&2aA2X^%wnv=!3|wv;d(<N2(|N%%m+PVfY+j-J=A^g1p#=2MmO9
zPBcXF27py!XK*z<PL$a2|4aa4i<S^}9;k+9<dHya*H&|e1z*tWIvv$KSJuTgr0^3U
zF*pwa{ZFv-vWAT_k;~@@^l?-;mJf_c(@>=5x;WPoiK2XMj+T^_VOHf%5sSq4oqLxe
z2;_JM?<F5^_WQA%MEBk!MgZIb@c8!4mw5A^mpFFFOXw4Wm+)pnoIuUqDs8}2*!}=S
z6yO?9$A#Zutqt+P%^OHpqX0kt`!|}dy-^{6@nJ_pKv;W#=G}+jD6>)fj7vHfgqRfU
zA6@$jf6{poO4%8e0=yUA0kEqNmH`GVGKz(fFs73qe%Pol58kx3U+AkC7mor+W6&Nz
zAegi?pysxnGPOgOII(HlfoqP>R2c~LpcQtsbq%?lgox}ak=Ia7R#ahOm%MUeb!D5U
zGmP{VINk_^HV0~^$G;B-%o}9+=Fdl~a0Zp;H79hls}OyyDbrV?z-K^tqrf6I3T$9N
zAdr(SF#Z`J7v$s*7(fR|0bw`p5%(EzEFe(c1i(<9S_isUM?)9~Yz2YB9q0iK0dqj0
zK7ePd$s@^j3snTc_2cZCQeS?X0vcvJsX*Z{fDHpi<p|@wj>G7A#GhphLAO4%4xdxt
z0%R@zBWt`ni{|_lq~-Oqw&t6<n2x)H|KC&~UO{;RfX@Jzu%3#R<a5GU0r5c%Xc=Bi
zd#Vsd7kE1$Y|5YKojp}G>%t$cA^@xI83LvPUgJO2*XhAOwyx8jnx=F1kADB>gea$Z
z=mx`I;V`>#-NQuRTYH}7SN=6K&qTO)PCCl&`LUQZN3E^T@%-(V@NK+nvehb{&M=>1
zQ~8STESxm(^9!&zojM8BF6$@tcsKJ+>85nNj{wlj5}~amA-wMFVdLuXnjB8Hb>=;>
z`Dm)enwU?IpZnLfp!V`Kjn;O`aOa#iSgin4aKgGKpjiBtlMTTpn5o?C>DgGb9KVq8
z_J7U^#d|IcmDr@eK7>o|Q-+}ePG4~&PRmzVgAdUXd*T)}|FOZN-XJi#v7<WmoAijG
zMDU1!B+*)$*LPm1^PIw07?1}1dDRh+{^<}$ye8{;%<PD@(s1?IQt(Xf+U6726{>~{
zKw$mz;?BxfS8{f^;|u2=fTM>*l@gPLR>!0KC?Q|PRgl&DK;ru6WzSb6kHdboKcrMa
z``&RYZuamq%5MmtQ$UZ=;A)cZ(2Kgu`<qRX6>|`lOs9Se?G`P|I12qiil(C$rKc;u
zFBs~k@f)!E_Fj`YJ7CJ#3rq6=-lWejGecX+XwU5YXy6m7nm{glRO5nkubqi{9QSn6
zb^T)Vgz2TAhgA$f7w=B*X`#okRybllM0EME>hrhf(X{b&Ei(P!zj+gdOxf=zJq-)f
z(>uJx00LcR4?+@-W(Qbed(F~9{X83W>Y`Ysxq54EDG^as?`oG8fy<ydP~EdWgVy@w
zqpWj!Y?300LW7VL1{}V7>07YGi6wza!4!ykx4T@;dxH^Yq+DZjJhazl@%`9NXQ0h0
z%ynj;-oryqsZKEI8hKAoxZ=~T(Y0=x^!OF$6&q^MJs~5Z)L;yGab!BDBTHcjYv=qT
zk3_Nm#*Z%;_Bmvm@YFN(4HHO3Ba{6WvX1UkcL3!|zzM^N8LD!Ig}LIZ<1|hbsEn7~
zYU(Dl_s#NI6P*<>YcHGiz*QiIq*Pe`27$~aU`5Y7j`yi>6tQP%q3QZYR$(b;wS7#T
z<$UH#_hW_QX5j_^)R=>vC>Uw>H}XN9qFSu^2X<k;0md(jtotu-FMYF$0$&EkY{>)a
z72Fqm)8y>o>k^;P^>Kth@h#AH@Vq9&+0Nm~N<I}ZYJdhu+5F_e-?3wPDV_Hgm))hM
zLQR?(@Dv^!Ne%p-%ZjGV7>_(ydcxd#38i_VsV2FP+*s1mxg7%hiZ%s8kE@CNsM}*!
z5LWNd6E3Cx;q56vGX6yvH^$tt5L|ydJ}x2|6?zOvL)&8!mRDcLSCpm-gNLTDG^ZL^
zok3fhFm;afEI4rI=hL4GZTtHa-$2-g)2If8H_)9z6bLQy56tGdokAH1<i>akZy+a&
oQ^*6|!~gSzgK!-G`>7|8Sa~Vj+tln#;K3kGHC@$e<wwu{A2OYM#Q*>R

literal 0
HcmV?d00001

diff --git a/example/ck_tile/15_fused_moe/misc/moe-1.png b/example/ck_tile/15_fused_moe/misc/moe-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..91a1f2d9dde2eb892ab621bb1fdaa9e1f7f23a8a
GIT binary patch
literal 92535
zcmce;WmsEV*Dg#;fffq1xRwIN-5px2xVt;W9fDI@pg?eUDDLjT3Iu`{cMa~ros&M#
z-uvC>T<`by%dgC2Wvw~K9CD9)jG0g+h0o}yM5qV|2<XyM;wlITFYXW!o;SXH4*$ew
zXh|Lc;ps_Q{F54B{vZQIOI4FNpceabnHRxnweuN9PurVC?5m30d8;Y@`A54cjRpSw
z5{><yHr&N9w0wGdiO#M!b7^I1Pv3JJXBcDOU@`j~pT#eyr@O_a0ako0S<TrA5#bN|
z>s3N3$7=BR03loFIT-Dqk(y#B;Mw1!fW21)EPoFWl!Tr^k^UK#&=`UK8GO8W0mT1j
zRFoM2`?Ie9zNWBQ4T$1exW*>+|Nf{G(4=_6?S}sanSdqYpPL2{ArVwo1i;1#{+V5o
zasX@(_5c5i#*9EG(*K=p_S-iQgt`|;XKqNqB=Fx0Q^SOdiRtT|kNz`gKHFY9;hO1x
zFaOB$pD%QM|MSJ)BAP;EHTWpIs$r<v39x$omrOriAOP`yV>M09;KC(@_}3;V2_Zm{
z>R#Ai{BO_u>$bS;2F2p1%%RVYg%Q+z&Da}M?WjkGCLjJxf&T5EM|A@J^K-QxfO)Sc
zQlBQ}u{mo>fiNZl0w+a+)_M9b_X^WJzdzR}yd+@BrZNJxJO|5~(p!LTf(mE8!nY4S
zT4tzK-bi4I+f4+1S%3_Dhks+S8vMO3B}(`_5fF-=5exv9zyLPB<KrW7!e2w!bo8|;
zp4#)s0x^8Ob3cK(5J_S42CBZAP^M9d&IY5}Yuj{M2)-;D71EWn_-?-B5vD!@bn1@?
zOr@YyFA2%HE?zyGLZ?nc6e%V^_@fE|L;;_G_;b0YMFKZU)G_&oQYs_CY7@m-D0C@5
zV!!`NXE+9Ww!4#NAh|J0ZE8p+>%`(FSPU_bA^kPoV0MEPwE?HAO;0YKKR1sUZ>p_X
z7etOT-;4ZB^3Nk?3d(aOC~d4|#^UZlX(_04=_oVnS}w(Oo^09@QD;E+nn+(kfWp0V
zt<?|Z8MD7+OfUqXGBN0W|0z7PK$twcMH#0#reD^wY8JP$MDjnos)We;1IB;u3~lwV
zYv?$2G~W~qw$_)$an+1v!7GT*?${yl<X!0oT0<o@oOBStr3^+OPH;4MA-g=={713k
z%OZOm3H$e_%U@v{IDgMJU6l_I-Y=LSGZLzf@tiVGBKF+3vY1W~|Es!-KqPvSU6*lu
zM5Lso4vAs5g0QmR{zsHZE|jQv>hCJj?H``p6HoQ@O7Yx_EKYL<-qfl3D^>_$k}*Wt
zHJsG4UDb;ghJF^$9SN@$_hd^M7Y%LEM09&x>f+g1M5-T#5r)=E1YNnv#-c*vHu+DK
zw#QP!T*Ud3#&NAVjMQ;afSP~lxl-@g0KojaN!T;VPlE%OsTPUD6p3_)0>hjS2PU3L
zX7iW-@f+F5MsJ{04$otftB3h!UqSd<CZ|fG3yW)I@?50o)JLk+`G#iO(-_7@Hix8c
zqCYUaEZ^_v-?%q+iwlXFn6GnsulRzHTG&F@SxMKdvG{%8NNY?`ecX4w9<&Tbex++B
z1UHs_IU+ClctwpH8U#cQ^G*-KKg#^a>eQSDcnqlHAZr}Lw}oHnu+h}NR4G)=%3)}>
zze^MEF{(}`>GdZvH&K>ID|$cqE)k8up?+&_`zf^r?lqGGkV|#w^3)aq#Xq%fTL_Br
z-?ag4PY#bX|E!=%=B(F3IFXlw>$eR0bIO=Fyg_JOo**=}cC>?vxPE^}Zu%P$Rz4cx
zsS*tt&v*4`;JFGSs7aGQ<2~19xSUojgJjz}A6mZ=%M=qX9!%xc9-`X~Gd<jJP>;3<
zK=j}K(iLPNe$jfoDd>Cr;ek0GXL8pTt4NQVIs4D+brErQB6fA;M`x4vr(=<m#zd2r
z*V{?bRDvrX+x>Z*er1dc6gC?$y?B=MnmXJzkd}_Ny-rml{m#(II{U(RbLfepd!u$k
z)y!qmG>BPwU%6jW%2S^IjHvFdP#d`yiT-1e1M9LiI^3mA%F%ZNiceGU<0Rsx4*3P3
zA$&>8+VyqaG16wiDTOM<T`JDA94Z1Knf7@ff4T}%Bv}tmUSt*1OQAohoT(!APtx_Y
zRA9)fJC*!Q^VE(lly;L1TAPrXg!NnQ9X|&~A&yPrN7G*-mkqrCpQup6n;MY>Ix)*f
z5;rK&LmQi-Wn^x{rkYXa#!smd%u>^%jxcYRd2?#JLszhf7{lLQsrRnRoSj!MUX_}j
z%Vr{Le@PU;BXfP7s=eQ?xAO5Pt)fP9L|XJ#8}@Q&EdHm9-1Q>oVxM~8m3%f%ygUWH
z4bt%vm9NOl6sAhUM<x3TM&#DAGC|;w--`M<&K~X&7Q1_V$QuqV5>LNS@yx)Q9EzV<
zXUx#qisfEScUe{FO41YpT!Y|ShG0#dU<{gk^X2w&Mhc=AJVi~jZW2ZHtzu)xq@+tF
zLd_ULyZ77tl+GEQUJcUP?NEfR0Vyf!jxC-57T1Oonu~_FmISbtzi5<jPu3w17L+g1
zRT-5aACOt5#}>5$SL4(ysCic8mArobPh8Co%N+xlvj@JZL*1N0dm{|`02E*$+hpKT
zF$={?{6@1<iMY_E`mSae5Rpk0<{ZskQ+D7PA>gbB+FVaP!^8bh6T2yvdxLksCB$1H
zyc3Jp0U9ly`eAOY>*H6-DKHGWd!eQXsef44#2mOpv49M#(yswHH*Sz>84oukR5C-~
z>Bl7CvCT5pM&@d5O-qUMWuk|!`@nxwZW_B$D3yaW?ERjcg}(!ru;zUgS>IIa2AG_X
ze2r)KSA*N9rT3>CK~WV_%haqlTNa<hX7L(!lA%WoJ^+c>%nx61f4LV?VJFELfSr7F
zYIuFr5;PdR;Xcy_9+I;yR0zSc{N#K_$wXE?M%wvci{j_f)7gS|@kFB^B9i7Q-i^xr
zs@^nd&W|*pw(O?QsvO{IBp?&*LM2Vy2rKsN|1>$t%b}ffdSZ2|p>3?genC=9{srzz
z-ux8@;Lt#wL>?U8?#ZhcZjY_SJ+j`@Iu6vNO_QTf%oDUE*FWHogsg=dELu3<2(C=W
z*5Ol)y_L)C1WkNNEU%?~Y>=Z^=lLRwvxR4P|Ct&u1qpMr>4u^-J^{zkh9^-|JzW#v
zg%~$n|1u@)UiEsF!V9&TSy_cv+H;_MqNnWeqo@{z=UKAd@rZjFl|qZA^FHswz%r=s
z1GVORY&_`+!=I0iZK-I->3F?;sUD<*?j_xC(8f?uUNuNWLB@x8sH$m(1fq<c0jCu;
zvmoTd-#2&}3xPUyJ6z~)1lo0+0utxv$Q{}|4Y`FMxD1#VGSN4|nHefRehYUOrJRUh
z-EugR3rG#^ib%49I1%1H8lsN37lpnK^HL7kjeRR+ZLx_PTP#rpqgSiIC{b2DR>mFC
zTsma1mo?U_<}Vsks}T))NGTfAM8a1mz*7irLf1<8^cOPtkDKNsV(H?nZP?+$wIWZE
z+_1#UnCo$vM=H*}o<ba}C6>!f(cbra&D?QvEn|O39T*@y-Wd7|1^>6(pu^i=?kBWi
z^FJ16K??wiEB)gq=z)oS%tsRvEwk-4UdedDh5@w;ogv5xpvs<O$iOsRHvu_9O
z8uP+*Z_Sl!-tQoq8Mqk?2Zl(yOvqM{y?0)579>vbAffYem5z<EUADp**QAaz^sz!|
zRN%003y9qj8VAlSBHIx%V#Fi>6;LkHKS_<zRWTmMkYgihQ_Wn;)=mHnhLH}Dr6=OT
zDi18AdIDE-7Ih+eE8it7^C~??TOboSaH0IiQy}vglwhkIhLZr~&_zo~e#)WC(mN1|
zGsU{>>PxGTb9K~7QN)$Fqgv&BkGDhzhwsru-5gnL5ZDT?r_S>)H)~S;ipN^CL;n_5
zeD)PrFTO>YbxsJby|n+g_A(?mTJvzD5e2fxgEPhPbM94ek_J#~hFT*n1!fW|XZ0HD
zqXkJA89x!xog8;vSV5wHm;qiD)$}HYv)GCm3^<HyU$4gQjG5m^?QSi3`jKu><{xK3
zZ;*1)bYt)H&buV2b%o!(qGO9=sM@&*^?4}aAU3DS5qw8{OUFL-OV!`GO&2xw>WxSg
zU9)c>WUDG;KYQfIx@odSgRQrw^Z-YuN9nk?a=)!|Bm9Be({=+k<0+7}x2Mv-7#rGJ
zIT@OFh79j5^lu^rLNiwwA9!LRW~e$BTel6&ZNzk5Uf&>Q<ae9XRysu9GWYA;@UR7k
z-CWq;h%0Mi)=e|R1NPr*R*ug|Y%r?sy)hAlCc&y<^eilL8CXCjwM|~jVWkE2#z3}u
zo$>Kv%fex_{4R;&IA`1viqRONByJ=1;OGCwK1QHZV~9LIr5jEkWt$+xhsDpikLy_Y
zRk0}~;+y!9F0y8)`$>GLOmux}J!ucjmKH=zMMgY?U3Iv)F$MDKcNh!bU;4SRXE0WZ
z!dTG>8YE~?&4^74>6LccIt-zx#1n3!1^IsxR@7Q%?W(z|+u_qry=&mL&XuEBo3^u<
z_M>H=T9*|Vp2ye#aMDKb(JQos2ou?&V4m4y#|*<=dyo@;^mj@RxTn`jyo7tBq8}2+
zgtWx`iFq`qKF?p+-G&>26Z8a(ru|+vR`~Ch=l=XI&>s>rn`^!45^4x$yTRKMqOP?|
z4;y`mu%`9LVnGBprvAaae?-cnPCZy22GYzMnaM}yz?I}Gty9`7vIt6MEg2}uC%wV3
z3Z1NOpRCU=MuL!jBgAnll&nff0<C+4W)vnZ^~k3rr1LcaIq}={aV+627x@fY2|54v
z2a`k&bMu4!9Gj*5O9Bi&&WCN0>>KVhUzQZx$4_$^Vw0|#4ITwMFn)gbmo<%q5u=<v
zc>lmY4$CyKY@Ncs_DBNB(l^(IuYgc0jiWb)P2)!{>_Z1AGMM>mr8V@7-m)i{Zw9Do
zq-I{X6Tj%<Y;4#Oj-jifZSzg1?cZvA{pl1K*&BMr=%x6XTNkJPYk$&H?Qo47+cE#M
ziwIgK`pEfCcNn`3kJfUZhjcH#N7cb1qn8|o461D9vE?b#&9fXvj?(dD%5@$^#ui1R
zxh5RLd2)G0QA_%l2j9G?6q>#%d_SIf<|%NCfTur8%`jr>Gs_{3Z3LPkg69B5DFllE
z$U=wnw=lI@buEWbJ?*fTOLlhMJ|z!la`O_$*ciE>a$tXX6wb7xq#_??nFJ~mc|2;I
zta!>PRi%~>-b!~m$*ZC3SzPrI6^nR@s>9Z1+RC-Eh;VByn=twlILuP|H_Y0LNQ#c4
z!bpjH#4}uE_Psjo5PZ>9<IZWr^Ky<_A=(1O9Kk1pv&A~h1Xni;ff>&%Hj&!%X=Q0J
zaYII_c8FtQ5R-y<ukzN<iG6L2OT4y%ftOZeyCUWiH|7gSYq2T{`u?BJ-W9od8s?e6
z&UEuJnXAe(V#q1KF-bNsGKwm)4co;`{IorVL8@u^>WR`kUVh#^&9*;PMOJ)|x!f`p
zm!)XT9OnSuKn)vpkl@m~Orc_6Vidff5u=b)q|oL<BMKvFWh@jFXB-BoyRMNHa6dDU
zRA*bhMt4!6Y>dGqUHH!V=-z78YGyXha^&4&ZBa-WtmhDyGX_X-`0)%O8=v2|lfeW;
z^s9D239+1LT357&o{5=5%LQY%2=%5oM>DT7*5_5+u$gXEbs)Q%-H;gRhq7npZFyjw
zvD#Qcws9lt6}1Yk{0FRoNtZ$lG@k#*T_}r`XoHQMct_mEVm=7HlHUHjkBj|6)o7%_
z#j<LSz4}MLbV8`iW<s{>yDanLk&Tnz`3%GIEV_P#`Lfy?7>qVFDQe;y?WO{NesQYt
zGP8nCEcHOF{1nJ*(NiaEm3T6`>ia6*tbXY@2d58JCJU{DJw>(ycjy3Zl7?KGxM7KU
z$oJmY+)2>&pn-wAA=)$d%oHCqDm*l|V5%x{d*D4u0#jOu9{uqB33e+mn#a;A3M&Ri
zA@Nx#X@Gc&qI0l_9(?y@S1U84^tjHv;4t{rrxC;GH#5$j@y4Y{aBN@=90Mkm#O!VL
z#VX;et6Z|})uI+HntvAab32OgvN^JmNG(Z3!WA7>i%!Sy!;j8UV@gue;JtEADlaio
zM}F~N*D4pmZcwe3nyI^E?C`+PF}gfHGypV&@xKyHjLkT<HSOANf4XGEl^%CKg(IM9
z;j$wJ@*i~vKAHjy%xMpX#pymgx~1tqu|%m|s;eh+tz7vXTu>Je?Z#cul=d++whH{d
zG9X3V!3HE09yK?|<#)V8quW;+QE+MSbsuRnF0$BudLZ3cYFiMJOI*^k1hp^yu5O_o
z2+Mxz7}eA8)Yeve6*4dF>2iGcK2BPsBKiPx4=AUgtq^R&?dhU&Q)|AINbW{MJQZAG
z4C0~YKHCw>x%|yCMMeMpV+^?^SbW`-{q5$}yIR)G$;80x)*a*S(U}%k!eHm0rtBs@
zo&g*RGrtKp2b}iJx&&6vMdj<RH_eW|m8LXO2#M0<SQ3(j6c`>G#?;x+2~(q)RDVX{
z%|3j`{!UuBo)7p;Y!OaLDMhfQ!c=f&jZ;h6H<ZlWB6P0j-cxRBxm3rm2hd|#xlPe;
z^mQ%A_K2d}Xp{Ta+gpvgt;e}UIZ0`!lQViHR^~hbrkS;6%va8JMYU(lOI^zc0{-$9
z#mZpsy^#qq9x}UhZpV>J)raQ~{9MzQxsf31!ls@sl^~UN$WTVZ+R3!GScZs}jgR$K
z<!9!~)XAB^(Qdk^l{}V37mJ0VoawD&o-yfRy5yF})vpec$1Tp4QnabKM0tI?_i54h
zgKM&_9VD=YF$TBv14;KU_IvnEDztrYQm*(pIPK~Da>>(J>U}vevdTn^md?#Jw37v^
zKYl`Y;lY=zFt>x3L<U#KtX89)$?+|F29dy&#YR=jfU?jIONDlFMkGhJxEW{%!42qz
z2Z-jjmulW_^0Vd_i#XH#!xj~R35Ee*|7I(bQ;k4UbQ3OUhm1qq>CIN=;HNqdB`^DD
zSd+i7$m1YGO!RM0niQt-2yY~)M5_yP(Zh>*MGMqEJTkAB1qJYcd~1W>jZ79+h<MkZ
zUQgAl-X?`nWn!Gx1m#oYXssKWGsj>-v^}4{qd794k4gT#Ux*QpRkKm1_bwzZIkQdp
z6dcL&i{mU@sb0>rGx1}2c3?e;oiaM%O;AOHhJ|3o>F$pY<M9WAc@s04?O}fD=Y8LW
zPit#A#dde%k_{=ueI09SMYj`l)xuhOjBR-K>Pw!3Z3s#NU?y*=$YS3rY()G_Z@f+>
zxhyj(m%<^{4as(8Odl{;e(WI?F|E$iD9WK3NKhrC=%wPq4Y7pWOEp)+R45^8Vv<~}
z1Lp+r<Ww%>Z@Iy9q=aYxO-7Tg9YB>FF5AaIar~nEfU9-8>8tJ*$~e6y8td)twJZm|
zV7)O2czvpM@#yg)T2feg<fWDJQ@C)~zy(hH%TaLe=OynW6C18P{WuG^MaSFVqO=8;
z6+7tu&5oPb%Eu1voKFmqpE(GJireOgmi&l9OZl>r)Rh+Ni${Uc?r`Vkmk|?Qc4ue*
ziAJPy^bR>n=k3?{i__emj?un`6L<gfE&bNp`e^#x%vl{&4TYUj)^q;WD!^k!(fzv0
zhwkH3RFpcaMV6{Dnte>W8`qtdeqx@?_L4)}wCKdC{9;7~V+eG^L*AzKk@qJGUutS(
z0E{S@-Uzf;P}RQN@qCv`fyP|L_%&;Bla@w8{@cM1i7GW%ZbHhMd~=HI&fz(dInp-6
zzK#hbqrla<*4SVm|0>s>?&e44DL-^HIB-=n?F4)sK={ik*7Gc<_>*3^xg--3o$p+<
zS`lNCUR#{^hP-Mt{J=s?Jnz3NVt94@ySaLlXF(C~Rc|TvrNnLrS*x?_GOy&EkKBg{
z6S}U=ssj<A<p=}w!}_tQhDG31-0fPt;uHVguF;wL%BrlhO<UXcV9W#$=@Id7TA|95
zmGAKh*mrnny$^}@y29M&@*2x<6-~2zE$J5gR89QK5sjshZXc60h39f1Bj!1?<=Ag~
zWsj^%K~J|k%Wgi#=_2*F`yzx|D@!q6{DV<?l3Y*hEglNWulAmTtq>(lc29FPFa3m^
z&r8}!ANCY>u&TBuFKiSI?NZr_s>FHyhTAF6z;iwX^75ZM0qA0I7TIl3gFM8i&KB$R
zqqBH*@Q|922Eyp^Fv#=gVsAAkV|I{R<Jz{(Sypc>VNtc-_`=l1_r2wT1Wj;_`s+Wo
zJ^gPT-LnHVOHfg2ikGD<j`mHF3v;=QM5=~tN9tEGjL$4p%VLv#C`IWY9Nu;YmoM{F
z>wYW*FQYgJgHBd^vs96@;Wqq}>?%W6X4Fb!#;q&5qTerS*$e04+T0hj(D*Rzrm0_Z
zhv{wT=lklrzPCbYQ%(!ckC$hYLuA1oWS0xfA6i66n^*;ltd0}Wl{42mj+c4uRx!Ot
zgX*m(S*nynl?1&mmU}6^g=8VTZrgNw@8|>HaaSC+2%lO=`mK3%x8#;=*@}ct&ZJV5
zQiQq=mHgNxKji1RlspphGVi$@4WSJ8%S_}e^e-Iik1oV0JNy!U-z;sHq)e(SE~21A
z=H{|cW-N#i;a0tK?!{^#WX%4ktJi``P#H{D#!DaGhF@3|Cl-WTQnYwefWNXFM50sn
z41H`XPhS@DcJMmDx@t1Nd~z61Be|(mz7@_3<0F)?8G$~U|HWGtco92D`i=GDdQU4H
z9dDX+9Q1G2rQfS#-$m#&Z<qR!#?_uO3rbzP$UNPf7+;gM^2n2I>>6=c-qCG<RlOUG
z<C(G<;-!Aq4_j2f4Ag4c2G&OOo?Je{Lqh*;Op;&QVNc{}$bwF?FYKz?9ewY`bZHO<
zMLQAK?kFN<;6wLwqN6r<e~<IFZkIkeZI3E{BdA~b!w9(d$^G1U(j(4K*l}!b-sFqY
zgp<vsgb&yF08qqp>FY8R(QEse^Ye^CsSch|0#PB-94+?sC?O87#>2(g?X2)83f*NM
z?@6HiXK$`6xJ}+WJvI9pMoId<YqUNttGeup*n1@Oq@=8TDcw}O5`wDiy2U~vf(X<k
zGkfvv{W4kFJfNCKMHMF)9lvKqz3Q`B|IdRfV%Av1D6TIY7^BvFvKr0K&gK?+BtkeA
z`Q%Rd6<}cYv?3{~e5CdC>IRU#u@43C%u=ik#MJ}9Tm<|EH)=3_)>ZX*fdGV8)07DR
z4%W-yaONTN&&(1cp1b>7Z|*2Otw$S-%ZVcOYaM2`*Zecv?d_Nid+DhkhmZRzY}5FL
z(nlrs?$kRl%J-*KibA3xVKNc}t%MmJPbS+5t-6*NaAl?`Tk0@!oUtq>OnOH}(I@${
z_2S-+jHj=$`Nl2dz@LoF{dA?UR6*~|seY86Q8F0hw3iLpzl_?O6_sJVG>BN>_09hN
zKKP+w|H{euVv)pO*s&|U?~_Hv@AU?>>bEmTlpVeqG0@}9M^ix5*FkqiGGd~$`-`Q#
z$rW%Ad0bXI?BsGZctJbWxA{WcNJz1za*Yui@1f8xfcb}^2xA*|??!yNzlAlF>-pqd
z19IM?TUE*PTS7QKAWVz=Qq+*&=)Lh%8u@_6-btmgGe6huvCjnd6|GdkV1ISn?Idk1
z)rDX2`qhJPqg%`~Zwv%#cnzcI_1_gY2dH^GWUthN@A<o=QS-DjPd#pi((KteFxCU^
z2``?HLA8?J2OK_a-Rz-V&!Ef1ExmGl9%e?|&<|K@;xhg0XkjGfL*d^|0;A`2`a>uR
z%{Fsf%`(R^>5BPoD5`Ptv!j3!&DxUfK1fzM;`E79gEepAz@mu8HW3t;Zvv~Qsr+zi
zr@2|pkM^um@gpU7_zg7&zVD7wP!X1m%O@}UoZNbeuRcpw!4b^}>IwQ<2PQVXv}jJc
zuda7E<sYaK3O=Q~R&Hz=Qy1DMmF?MsCi7#_WvN2zjK@X+@SywTcYu0x;%QBNt8nJ#
zQpfJyQJY$sHjxT>cs)BBMsPfhR_)@E{%(1>W04tNM%)Z!_B(g*U?9U(^$=(HzXO)R
zZJ^$*whT49JGECa;p=xHOTW8Ai|bsl5Q@ao0gWHFr0RJL1nXu88YRqkn4#T)T7bC{
z`tVPZTp{;$n+hg-EsDsmt!C<CZ?)e-r{1v|ps#vpiy?nAL}==zzi@xQL|V`OOQlm*
zPek)hW#tIC==|uO@Aq-?8NB9${8z#A4>R1NA_l}a3V`{&Ca6TVuoK41O8gpyCL<2B
zN|CxRN_+0JRB4LoZ;05Sg%oiG4|q8phpx4??G0uvm-%%(2JzSrT&ofo|L;wogLPes
z)SyDd_#F(l%L$)D;c)J}>RZ;#4<r8|!MrKkwl}Rei*_5zyxVn|c15>&<AA=(CI7vZ
z!kls<`!qf^u`9Ji-?)@k+I7YwV)n+h!w1Xnb#@oSL|_zX@0cKuKxvamyHe4>%&;=e
zTCv~iOxJ2{J#JATxKwE8w)y1!GehTj;>s?67dS0T53fJ}Q@Dk{Dd$tmgi+9?);{2z
zO6OHxERkdtnlH=CeZs`Y*uB-(@(!w@g>56^M}Mk6Ic+-SedWzkFKTWxrQ)HUDA7Xv
z*!OM{6>A=E`Sewal<(3(nNw??!_d5s?QuP?^{1ywje<>_gIkKlsxG<tXW&voA18KU
zJz2Qp=YspVzf0h1@HaLX31)EcEWkAX<Nk7nt&PSNy))Djx?X6{`yHzo{9{lD6+m2w
zPJs>E8<@N<39L84pJJab8A*((df@V9U9S1zYJ}X&qhyjrwSX$~{mwGVeiQn&Ex+co
z#>8_t3%cB@>XEsoiPiQkhlv=DA7z*D!+Zpk_z5lg_;3xh%%QzWZ}_tfm<C8c;z#`w
zH3wX<yJnx*Ahl!l%l_GFuEI*2cL@P`YHK~k`M(nmEG#n;<DMk4j6A$;^b$D@h+VpZ
zIoQUc;!eUj7>nit$@L-DxC%Y*MQHzpBL1{96ji8cL)!#9Qx*;JSzp2zU8-nschiZq
ziNM(K)|#JL7S|Lg%9&tY-sGIu`qnVww?(+VIrJj)ks&s20*yvRb5UY>{BFC0knHoU
zqqdpw8H(GO>2pgri3}KqK|A&?iF0$L->JY?flxXrU)*InJv_o)QqLhnDe&x=z)z)|
zK$vxI>uav=F__|qoz=6d%RDpofr=OB_$*=)j}+f36nS(RTpdqWXa;hzaDe!u2NEdo
zFyLl=;C7>oMIZET-B8-`oh_(H$2C>M!e_Fv2LE&WYXZ+rC?SZJ_%^&kT`~nrbzusV
z)gYVwe>ggXse}?M(2b(QN(R1#9JK9Bz7n3%ZirPXuS~zYrOK5970>D1JCC;*zaLrL
z?H5Rw3J*#pOt1j)x)x-kzE<b{X}>}5PE&(CuA{7-;hL0V2BMF6nd>Y^cl65oEhJ4k
zaiKi0h|}P}ZL*&a+MV!m>2=EDP|9sL3Pxe!CXP}i;a>gMUKQiX0REFvs!2mmrLg>f
zsO3-AS+InU?ua~}(Vxe){nW@e0$sl-Zu><eB_wb^z$$!cm#`q-U1Sk}_YVQ4-WS|$
zSt;T(Vv%CY^PW16E(XgmY6ogqsnDE)|I~Z~IeS4EXIHYk=i|<9bp}Sa^k-LaX(+|a
zwBvV99NvNuoX;kPLf!*NwnkMy^Ui6dw6Lg80wSVhY<kz_Q<+{(uLfsTO599s76+=u
z!mPF0wAV5XGGO$t$*FPCK7->e$7A0~{8Z9trr#vbrT0a&!Za9@N#pD0dXZ3>Q)6j@
zL$SA~@$@UlyX0By3^L!A-VF~<!s-v^6vEHPA&pmx!;mO==68Imt<?|X3a&N2Q$JAz
zqp;+smIT7qYc-lIl-}(r(l?O@ov^o8;BfP^m&02?aM3&dPV1jR)%&U>7%sC~kM{P4
zdhXVU2&u(BB$~DPdf0CMYCjx&x+*)&W@BS()EBHnO;AG`k8^2pJ#sa^q8^@W$ZX++
zwI&>myk-t(bxW!bfb_E%<I4<X9cr&_Zt1~p>7~>;3>c$#KJs;jUrqM*9g#zQ3z;VL
zaJd9a`9$n-`W%XPE3Yc0_}c21R<4Reck!=;^trIzqAa(=_QWx<J?_h*3#~<2pc0tJ
z)cM)>1E-<D%_~#QB09L))Axj2a2PnF(#FOOipWo~SLwAW`=M8>mjKgC(PkKi=GF0^
zJ_nY>9kJ7zj#amt*hpwXYvW(LegvCBZpXaCe2FVFB+KcU8doPf6pP!Y761t%7y~oJ
za2(}^0OfiQ@6|vCYLZg68+sl(t!LJE)XXtL4cGLTw@1FwsCL|}ZG;8o9O}fUbu4AS
zqJZrahy03}OWwK|N4GfpEo46uF><U^$+r?&v{Nne=y_G^;#a}P$;-hE9o-3b8L5w2
zX=|z|jXOQNGR)WaKDG9eTA;{zx#Z(NM>E?um2=vnIt9eHV#3llpHidC+UrD?jg)I=
zt4{fR{OVeZYLEVN#wUN=9%Azm5U$lG^uK(Z+I=t|JgKiscCg)09v(2r+b`N8B)6-6
z`a<SZ-d|(^dV!zIXi^4xrxGDC2jIGo%s%Qv+eGSAdn5`(yc-zPWJlx*VwBCu4zI{>
z4W-*5F!Z|~=pxss(Q#g_mf<9*Jk77LSK}aH`O^)B4sS4mH+iX}urKc`#$GyHFMR+H
zLTgNqMN-Z;MGqr189lMRZ9nJ(8=XcoDir34j&oT?uA3*i0W*5`=Tj+0ZeZ`#t#4d3
z6^8e_*wyj8u4l$G1;WhSi3u|qB^Dq8+k}1^2+oH@S_xUGI_uORVtN9!QB7&HIA)<g
zB#)j!?x;5*d%)AZf;l5(y=#aTvu4E^aYX$YWu;zzdWE1T*0+#fC3?+adM|Ybv$GNz
zG-Gm;irc^T5`RYGdQo9R1m-k256rzsE%h}0FFN=nulz>&-Xf>5HK-AVTNfVhes(zd
z`+OC<+K@6m#9yEG-lS(W2rz&r{BQo#V^WG?3%Z+rvY>!&_AL7g-jqu46CJ<2B{*h2
zjJR6Ew<KyE>jr2VTAkN>L4}<6$8P)63i0LRWkOA>6-9NqS6%qLE0?a$T-(oCm1$$X
zquk9f6WiX<Q*Hwe;wW->JNs&NRmtK5nSXIO!LDLeJV_L!!fzu4Gs1(`v8xr+?HIEP
zEPyXB8`<z_IW@T=@OAS8C1Z0_VJ~#hd*S7)7(tptof6N?a()|$lCnj-frDB87j$g)
zR8D=Pby^K!$~Udp%Ib-FXquk)wUuemJ?RH2vG*RBDsSkm)A2O(BhE_}GJX(ot2GP6
z1op-}+rvKVn>nc-=8v$}_z}3rUqaYMdL4q5FkXtG_o6<KnSPRFjz4ofoOw`x>`JM!
z-%x1sLGl|jF!`EiM>f`0a}FS~;9ORvrlmyD(~9k8dmvQm@#dlr-ue{qheY{XrmV{Y
zd8ZLGA8IS>IhOm6sdMg1s<Il7vV;a8^EFfHQwAfKhs4vLxEXL{J^cA73X9B6vrqyb
zKzerN7>i9JjQ##<eOs-$$K^t9zA{(Sm9@?+LQ;!Y!&1y?6L*D74R{ty559%xFYCB-
zLtxefA;7{hYi_NQ;-4kg81|En)dO3uIt^j3FMoze{j#dI#myZDCVfhI4C|?BPlKM-
z4Yi&weZe8cP3k3XtUI}MY^(+A&eQHh)KD9LGAYvOm3g*VlNSy3byaaWRbE*<C>v8P
zvN|B9pV{r*dv2QMrqe7#u5TR66nZ+}JNfqbI$_2y!k(o1^Np^T_xoNdT^h00yNZ=Y
zoD;}Y?IfW9xAC4pg%PMIZs&mW=_S0S^EbQ^5b;k_UngKHrwb4h8AhcN-)~X<Ta3>5
zS|Xtnzy{BGyc);D$_|lwpErn$!h4<wTSOzAbaaRvl#3ltYBB_b$&QT7$2MCd7^pAp
zHV<KfT&9buzHt7d?V8nTVlMd!egA8VAN}PnoaqYLisZvOdkGO6-MR=ekv<7TPb<JI
z+1~s8lyn&W`NCS$u)a@29!iyftGJ&poF4&;{RIr6vRU4giZU;C+~@Tgs`be4tx*RK
za2iOa#2<yVhcxP##}^;}1(@G6{Gm8vQCyoTE;2-l(A3k_kdf<OKNK^DzqTCS-EVT}
zcYzy=_^1jl8VX{9SPUvL(u^GEfkIw(&EBsIU{vGY*qbMs3rx-Yw)5BpnH<x-mfrwv
zx^rs!d*@G&*P=@tuRcE(r(x1n>IFSMax=?yUu3tzO75mUI>6I}wrBt2>xqC@){7Gs
z7kj+3TH;H<+6GpKi5^XQuB!7un8`Vt0d=m@5T*#nZC@v?!+x0}Go~(%V?nVgQTNMm
zp?!N0Jn8erb68?``4Q4Kd=?K3-N!h1@S4or@!qkoQ8m4>e53EZqv7Cu)!B1?P+#Z}
zA4Vm}y+3<M^SOgmb7szdgRJ!_`FCS`Ywp028eF0@zfIn%CtH?!sxU*pg|iPNe!<%W
zslR()9{q=$ckG$HPD9pw6SwlFRJR$mXrL&qe!=h2Ya@i$Ir81QV_RTuESmv|ro-;x
zSJMM24_sti218;k6ZHy{{MQ&nxrj)$y$P)Gk|@#2zLRT_eg?EiPcL$RGax~GMOtY&
zBW8Ru-;O7?pvQ89!(G<1jD8z3%@j-rKtxs%eE%i0Js>`5)WgdBF*Ul?p6dSZeBDF~
z3w<3fWA@EmIqJA!6PcHRI7S%62^LuFDO#rnGg^$)GG>Vq!BpmRv*zB9KWko)Q+D^#
zN8odP@_EU~pfnMI-}d3VVBH|2z<aZyNfYAFwZGC;&bnftnjX9)tEUC@1jjkaA!#Fv
zVS1d^(Iz0{@Ww|jQ%Aec3zl?*aRWe}y`x&V!-S)4aS4^0GiDJvlHUc8lNAp)2Mq_6
zt7#Iv((Ydu{_<o8hkLB+QA-_B#}mK1rOiD`3g9&MSBS%pF2L@OV7HQJ%gmOu*P}nc
zTasa9^#^(OU-i>&ZF#Z3ZDLDXd>KTZJhUb(sX%AnDH)bo0%WuED=Ep_{z4^Qi@jhG
zaNZn%ihpQn((GQ~QT?re1eJ(vxzZs~@nD4XFF+^QGYc(#BC@Alg2VZ%;Hls{uk6#!
zDcm^!&S$r9p8n&1`-!)<&Sn&2C*VkrXZy@C3>lhH;|gswMd2*jkBPpG>@)V}ofGm-
zurRkzOwo+a%-Dyn>C6mEYiMlkmc}<~H@j=#4ew1Elh2v6yPTqt(NLlFn#A`lbmLB4
zv8%UBn%f)4<d)YL)(V{(1QS7-aAmjbMGiFw`Sh$(ep(xW6ysKMR6kpO9xjZ4VZGj3
zQPeS%f>FKHzz(UK8)kXcv_3a;D{tjG@kvRL*7?3<wVNqjr5!ZiQQ8;YE*$mpmCMm^
zyPbY4@{G5U5jyiIpi<cLX4mj@!!VJ?=v2<ixPSBnzgEiC^~#oAZSb^HjaX%8@|5@0
z=(}O_+Th0q8=n~srZypsjHW#^yPLtC=*JFwOlu{cI`wa8SJ-g&c=sMtb+2U>Q@g?R
zBF`^EQ2qD;4W6E}-k`G)ILM22>kQW+W`&=E?Ai5fPtg`O$Q%RO{|PkVd|vyo=uH6u
z&Juxq%QU~Q5scR$Ws$O$TVCaFFz)j0=JPC51^4L*WY<Tl{ju%JiM=NNgfMYB!nts2
z4A`Y3pW`DqA_V4VaP$gtY~9K)f3(8Yn0@V^c12!W-`D`U6SSJoHXu#CD_izvzCEJE
z#P1{5pKq|<VTu{(Rd53)k)!E?F4n*G;29jnwV+_wFHBCaw_Rx<LLct#56LbwiKpA=
z;*ZLSeO~G$I)E6#P`RYH+u(zphJ~)OFSncIP~zoxYEOP0rg45Rf0UReaIiW*=U#p+
zq{cY)W37PMi4KvtTj)jUC>C<JS*>c83PN$E^ZZGonDN5ecUN|2x|1xfj=z{wUkk4f
zS^5TMlrzQFo@!(^-MV!|+PiCb#HsLPwd&Jj-L>$Iyhg9VIkE4MS_V)DkD7bf=p4I?
zvU?@y9hUhCxmr0#P1lpys=1>88@G&3iA`9SQzhNl4M><ZtUv%_%T3mr7`7V}l57?u
z5WL)*1fDd5`o<;TXQk3H=#F3APRtBO8JIv&df9aVt)nlk8Xx+1Ls(QPr<rDqKpltt
zyI*X$iXHl>79w*ne<e}=QUcGFHhQ3c8E2~6Ega;D!rULPs_=I9x^!rbV##2Oeg8u&
z$;2Y+z|wd`3v}_}Q~y@Peyzny^H4gd*%XCUTh5vH+#8zl@Q|6+(SA?yVCBpoGE8As
z19!{6uyVv@q+b2}+B)&w{;Mf}eHA-&Vhpg9@z7B5NVRK+7Y7lSj`7GU7xEPoebfJd
zTyMwUtZO8uSa=$d1nH;eg2xANA2mbWCAQ)qh(D1J15e_cg1pw`o}AsF-Z%i|b0pd7
zg(J|Ak$pL48|-@J!`&<mjER`(4}eXRtOgUXvQI$(5Qz?0;0EtoA;Ftt?8B?@3bj>4
zj2(1>%4dI$C2-9F{P}#WE+94$f*X{gF5$9<<{MSXaxvmM^ocKSrw3%cRrT~hHp7+k
z1jnQPWOnPSt%{D}>ngA(?qjqsYL%W3EiTgsWrlqjgBBbE7KaMNv)#jEmp%vGA+H>w
zEktB|pxc`_hqHeE?|3cEnF*!@n$&2?b#G<?E5g!#eKDgb79Yr=5;rf+K<p0qcTO3)
z@<;EaRH5b2<+{PuHJ1jGO~+>1(9zB0vrQC};<_sFjs2}E<N4Le$@sS`m2{TxSi)np
zZL1%&JEu4+8FPAB>=f%<{PS%=-zfSp;mX@G*75okBB6t3)Ha-5s()tx;~AxD1~3U`
zyvJgYi&oe)r(J(?xie9c^Q%lh4uQ<^PbmLSLQ)I?-%UKC1?A^=M81;f>vIqx1o?yz
zMlJMDSotIcoUCqf_%Gl*(0<x1+JeY(Gp6#+jt*v6gCanJcWz(QDR7>uJPnO<y$)01
zxPCZtMTB_&$c!OoprTjE=0KI<_9$+G=g5!$Lr?dd{eEaT)~jkGAAd8@*7N1$-;ts*
zv9J<?DbV4sZ{IEI^}%lF1n(w?#^#@kNPLF6d0gJv^u}(}5i;J4>z~>&ac6{nVt24%
zerY%LW=RTZy0m^@?mYQal50h-+(mfG<I}#&Yi?*9hgf;j&lLeOQrG)7*rkAwm000}
ze8xC(7SYPm+0D7e%-JcD*eDS5s+pr2!o<4l90T&E^1`xPF21Kye}?^>rw85`@D-Cc
zo9A!ibA%2vFyGke5%3GUo~?PjwX_UwAfB&C`If*ZHZh<MEbg!+nJ(AYy4&hF=G**Y
z%9Zb5NXA2EzoBpQu@Auv6wWR2g1d~T%)8ao|H#>`ig=FUdTu}d0%i2^axQsd=2+M>
zp~h&ja|3vQ`<8}y(%q0X=P9+#G6oj8KT)jrC(c+n2<+}3v6hITjk~eAavB7Af^=DW
zMpkg=SbiVJy|e2VdR<RAm9A$bux0|_2;oHev=)T3!jh`L3q9jKpN)S)66f}iL5JH~
zEQ8+iBVe*L*uTdO)m@SC4XVlV)Xn@8MGt`Mo4)4l5iWlNQA_8fCE}Ooa5c&%Lj0Gu
zJcIUv(=s*tquKOqnIM)^ge+;j{GJc++PF6w&1-(LyUu}MwqIsC-B3pE04?rlU-al>
z%jW!Bsx`~2R`DRabN*x<$+}uS#?_+p^*fW}EL4=<(6w5s{B?eFj-Vl=dqrf|Ib2Wu
z&TWhx99pcWLPL6ZyA?znt2kg#SvUzEV&P79Y_1+wE-_p<btEn)DU%~F=SBLq%l=E!
z@**2<zP|K@m2J|Nk-<In%H?J2Jn0e^$#B>R^~@-3we4244&&O~S@`~(1^er}DRq5d
z(q0Pz>oPsY;v2+|T*$(!0n7gr?;<n_-N1viaR$}m1B?q2N#yv<o~!r|QK5!-=h?PQ
z)Og{&(E=qeHn*K;vFN%)S#VC@gnl1f%Qg+gQU8^fNTqBAA}T@#mtxsnLCq8GAcOG^
z>gRsl<nD7{54u&alOM=2a}RR5IvP2Nju*-nv!3{*`U%)?DhB1+h>WDn#qSEx!52q)
z@)kls%{e!Cy-=N`i5xk=`n6BzKPbFFC4xtQN0An5Edd~p)WS`AU<)v}p`^=hgt1p`
zTF*S&OrMX3Bh)W8AoUj!;Ja&cXQ;BFu1;zi?8eK_bigcyP&>Dq|5%GtHSK_(u#$hK
zHvp#L1ox}(3gyW&lyhX@44nHKPnO|Y_KOn{ak)uNHcHz&n6;%&l1`s{(7e-l92nK;
z0Wc7=8pye^FBev-5JlZuFBf8?e0R+mnVY_(GG*<>g8WEv5qq9$tIsNyMxjGgX)(h^
zl!(}F;fz#pi0J#3*aY58(uqRy8-ADn<ZAR&fY!+L+Y?BJVqas?cUbOxr&nfjt@m;F
zV4sKEYc=u+BTzx^7SZufmI91d`#SWKH)hFl1zL=R=a8f}9WL%FSb-omID;zeRPJsa
z-uwRIe-mA!%k|{q{k3-?*f*Kq%8Ylr0(?c#^fXYsMA```#p9X*;QKv7qO0sD+mh;i
zAp<~<lgY~0_8@}cW!>W3RJ5pjX6&TE|MliI@O`<udFWRzU|+`G#=Z?E8(#o82>N=u
z4$5$+8Q-I&_n9xY;ET9V3&N=%nGY-c?5WJ)1x~ZpS}`%Eh^yX%3E`ugKm3<zcRq5h
zf=6C1%w*D&g}?z9<$p0tbxRPXzm^;I<yax~mo_7|-Wmz_ow@T~qELv!qp%?{)K0|A
zL}tELBCC;W48b`=`+VU7jINWpR1K$R*`fNayc;R4@-bG-6d6VwBO)FGCwY%?8M0Q^
zAOiDaBxw11CpsIv)8gMz72d%C^f-T-KH&FGu%iDO&o*nD_&qv9OPECt4SI+pygQIU
z5DYX=UEX^pOEjK)wrl5aJyI`pI?$9g&CZ9%o<gU2KmCqd)*52Dd}qm59)T}*VetQ9
zi=`ugxE1qlT-@s90#_+f$w_yo8g>>##MtFLoTPYGxt6!__WfpwczAAti=5j7OpFSU
ze)hxG>;)?}Gb#Cf&n8DclAyvbmLP(IHoP32!;m~OBM@$3AWu)OWW;=%cVn^QuM;2%
zpYu$Ge!-ChF4D3q5xhM14}$o!R(yolXz~#=FX8r)j=KCN&3)!v2I)v~@|gpF1pEl#
z-<ATea=_7?Sf~_euV-rd!p6gH-SmFC3o!Kztn2jJ1jMdwYkn06dCjUI7T>V?D9&|K
zF!2(8RPpbC46jnb%Ti{zI5TP0wiaK0D^hw9b-2O%IpEKQH(g!Wbv=YyugVv`H3I$P
ztW+N&JZ2x0T~KryzZBe`vxG2P;@~`DfOSp#nQ1TY?xS|e?K;S&m#)jd!Ajt*$?F6w
zfFz~`zqMb7@H0DPWTPn@AxJ1ZGX?P#r`Y&cK1YjO1H1dv!)KS5zblxUM3mv@n*P1U
z<N?;@q5GfrSUHX1zB6xCIb7tdKz*Za1oz#gBTEAFI@<Ii4<ByyJ3e;6n=(><It09e
zO9)zCK`;jp6Sa_#SbF*L#Kp0y*==80LBUV-*O)VId`=RK+d)``?_s%Z>GAQR-UjVR
zQ4J9+OZhL-p@0HB_y#XAUGFXn#%<eY#oXApz_Ki-<!vP1mw4CUU>P2Jlul&TLU0`Z
z%y9AM^o*|E&00ENWzU5X&*dosML%saO{6KD?iM*2exj0fc`qy`!lNtDO+IZJ4KryJ
zL#x5|Zn3#EH<!l-d#gPAKAqo<iSgiUt=n5toN=s$>ET$hrlzLBb`A>%2Zw~D%!5iD
z56*1<TU9oR9NdkajEjFjl&Xtz%j&YP$1#xEA?(miuf|oLUEny>pg_UEBU}7~xl~X6
z@M$EY-2rrfyBD&iJd9$WxinLd-*GwtuN}Krv1f5)$NiaWK%hy^@z7>gaDy&qUK&gn
z1<oQ9IP|iu2`-<U+<0GQ7UvPLB=q#e2L!NPUQV;IJ@N9+epyPqfBZl}VY1L*he~K`
zXQ$i}OjKdeHlHIFV(9<iHj>Wg<*G%jgc3=@7aJYD3V$6Q3VC1Jo;SwMt9bdsGipTm
zk=VacP`nL@gPU~D`LO{Wiep|qe1rTfDg#EpJ=J0khio{{@_vgqnoow{VbqhX^a9Nf
z+TILkH(Otd&}7&#b^&aj-}L6Qa5%$@liv~#N$@6Jy($}gnEU#sx{`en{#s5b%zhED
zbFvn$tju$HpdAQp)8WkPxL?hg)Ip9Tv<6;Hm+SKJ@wIuiWes5w5*FtK4{$FNa#)lZ
zb}WyKRJiR=Yiny)S6A;`ulI&OK0e}B>V?ms0F$`kMMy8i|B?i9&0z1N$MBHKl^ShE
z+?1aDUzR+C-vz;g(K2JhoPD{^Wc=W@=7(|89PwXfHFkj56kogG29DB0vwKz+0+y4^
z^Tl6Py}E{$TkHdVF)?p?%9an?$(Fm5eOa4fI+w}G;06&N=wkqPgwR2Ggy&HzyvWQ;
z&1T?zSHZ`3;|@=c3o*4pTBSBB4mVxU6lF2EnN8IEkhyvF>Q!i1m`c0Vr(ZW45Ny}g
zqay`L?Su&Yd`|1h{;stSvO%RV(c+#SabjZPcofpSf&v|F?P5+XJwE4S{Z{uiRHv}0
zsL^nI7FS>2a|B?L<iA%Pv;~uq&Fj;QIRo%T1QN(^ikz*;${ap+3|}m!t7I;EnA5BX
z&vsZJDZBvd=4$z!4@BLKq`;c6=+?g2IeD#pw<d#{e^^P<6OkffX}y|M3sqleF0~sT
z2D{ty)rh%Vm8`YFJ!+p?ZYnmmDh3I!h{w_1%^8g~Wt-QaVw)FYP3;*JifO}<c>`@2
z;936h;|F}>Ha0d^L(aBG4M6}lF$h4p;Vq@4_~*}1t2QSmr*9~j3Q|%ZBf}ooRr2~8
ztf#CYu~ewIDz7jxS6V$-=%q-(z})t$qa_Z}Wc>eUx#ne4_l#7Od&_4rq`DXxr=A)k
zXLriEyI(*w6%TL_6t=h8;1rEAlK)Mh@dD;Z$8&sST4+d9$R<ClOB(Z5C99=AB$$Y0
zAT?8S_RK$R#cWIZ)p~8!C3q!vW)<xHFjq=EYTz<!z2a)+UIITdft=~}@r}<)fGy$E
zu?ri#!yz!*vw4<smvp{@j-T~ZHV1yxv6t=84<yCmbzzN9*escr?!{T>1kJIUNW8z@
zw{+P8P8DmuW&+|1SbzNZ(abFG+qZ6T1aYB!S};?RH3+WQ?d|R5x{YAO9BgcC<+tA6
zcge{FYGEan*S#ddKD^up@#xKUr*^Tvgs5wJj8nwiw#oc%C4R?2%$1k~EO!5jP|158
zWR>k;0oS!KL}kq;ysd(rW|H&jSR7n3{pKP=$D60jdP3e?jdS_n{FGv^QjVbB3~$%A
zrfn9}hMT+UonwlTY3C~8ddHuQV_c)FfAYt(a7+U=p}tJU?)%XY__0bzw9}mPGitLM
zlbl(w-ig(*J6eb<*WYp|fKgKyEVf+yNN-9@NZ8()Emw({SvK$s-NRWm|6&K?E%Q3v
zfc*ab8*>=aK~hs&D-A_QMI{pP6O;*!it3|`PGZvK_BdLEJFE9IRHalAN&M{EOT5<c
z<(icSsMw(qRT_Bph+ZIkFD|$?EB`BgzzdWau*hpa^Yd=}{Uk<{+7xg{XS3er0}X8D
z{(1o-g7>|_M%n%`y9*mN+#vJX8{r%I)K9Ot?gQaHN4qHSwtEGSMQI9>1S!5f$!tNj
zG`BH#6Ko4VX)Ea^7fuYwPtA)Z48lx2B`3wm|Gk5pQSz~xgR*0`Yh18Qy<2+ed75`~
zt+u3rio#pg<uFq-=ml&y3a8O#=Ht2J(<8hOKT0uE50_}(AUxPLG3&<<4ob<P87t)D
zKDz`{5HA%6M_hV_>{i#Km8E4;TwHEd0nP|EDe2+Sk+A>ceVg}HtXLARn#MO6@{1RX
z?Y?QLfW67$sw!62Wz<IXC3~5!|BJr=1~c%Fb#<IT<xVrdXGkA4Uyk(VUkgW~(PW$i
z0~XLz-^a8!`?})5E0WTcq|A}<0xf$itbGt7;&-vqpX@ybgG>u*)Xv+t&0k-kVK`RL
zkejJ}0*XXwUY;62hPyWe4Qiw5Sb7s5@EojWvX3S+%XO`*lYmYLaF;0gZpvqGvVK%p
zH#0gK7<qV5mc*RF=eGCW4t0-Q1aL)pnDj-%T9yHtKT@d9YOs+H-)y}av(TEp#Ef=Q
zFZZpXl#R?bcqUO}#j!z5%JLCCMm31x&%Y=9I~XU}f%rT}p$buK39!uea|+j;E_iQN
z`$b1e55QAT;ozY!u0toU4Hib_U=kIP9S-9@Y95?|NP)fYbq*SdDWV;3Jw0!|TCmu)
zRrXp#q#gFWTbMG5gsV(#py{=c8Eg1isOsP27xI$sP@T=u*F4mJoZyEuKyA$_ykX_x
zVc*8EXX7;YAWMrK0)xP<I5IL4pvCS6tQO@C3PEKlfENq;BviYb0VS-twjb-24aisx
z=HZ9&{(+tewxBqcm8Rvqm}2|NlpCa9Gou{L8M70`5n=vVodEd|7|>(1*#`UnF!mJ=
zQD$wt#wrG(AV@5rARyAM<O(9v(%mI3QZuB8g0!@>h(mV`F@T5+(nt+3FbG2p>CiQB
zkM4fo=dSL(_YZ)#&Uv0+J!cXNcedON_}%Z7`pXz7?tNTFe3)J$%jllGgx|GVpMz?&
zJ$DoQa3^IPVkU*L3%AuN?q`VYj4Jn^7Rv2w?nx6XDk=i#x88HjSiDFFI(rLL7)NyV
z86u)s*j#tY)hk!xQd7hG%VzThjTgnui1RN=P}0$f`R;Glxi7tqQ%(_Pxp5=6a-jq2
zYuAxtO!8yF0pt<dR9xq~6A)*6m<E|_G3VFg$AvP^2|GSi-P7YW9x%Is_1zY&8px4?
z)QhbV@SXbHq28Bamex_bPD8PY?TIKtZ%J_ursxtsO_<)1rtS}b*mrX!;;NSRgYdAh
z6fsZkI||T=I`@$~YF-<2-IN||Ol*^HZP--P!{aCu_3PZ&jZJCv_-fw?U|R+aXW8q+
z`Ef>-k3R2jtu>wX_3_azF)q4>jNjWIj1y^~{o9AI0|cB9eszMSRF5-6lRLqX)PRZQ
z7IvQqD|Al4Z?b%Tc}UUT|7(fP{G|VJRb{0^-<!KCDm__pQTkPOR@T<ryg21~Re$Qf
zvaLDj{@R%sqlXCIX{AZ+h`Iw*3&-^WFxW#6)(<!~!q-65M~e9jtS5hn|FpVEH+nw7
zfGfXwU_iq)T>;_?;6KR^@c%TjAQu1D;Q6$=*?pD9;0g)RNGSDWX=HF<1dG{16zDy*
z;>zu%LRV|X^zu4iMW;+UExvujx&`OikGu9V!~Y0M*xwRy8sEj`WT>mFlQM`uGBXP(
zSKha3c`*_!B1~Aeb1pDZ4Sg}KJz1#I%eCD9@gO$cu}*kP4F|gZ;<x>LmJefhemo=h
z^iRx5l!hm`rSU+}cA`1gM3UYLInOl-b<>4klj2deM672Pt(O%kvEwBd!&X^!*CR|`
zuiJ#|*n74U=~v7cyo(tf;e-;Ds5EX7oPBxXksKTa8Z1J6kVJ;MO0oS_Hi@zDg)cyq
z&rF^2`>-?|J6}Suo1ST0?-rM3QiDJs>gwcP1TuP2%^Vf_RL28cXRQB(+fqpe-C$mo
zsi&oac4Kk<)=QTD6uh&-y$26q;-qZ3cURaY^miPLGurk(dH==SfO&j!hi&nxNZOCM
z3LIf0jG;J|bSV{dM!g~3D-^{x>@3d)$=LX%Q>~iX$493*TV4NBI;sbKF-&Eh1U*BZ
zH#qS*vUBQ-&P^@>xbDp-m1X4>)qhBb<=^7uR_^VQ!7MGBa(r=BGxzYj9y34w!Q4j7
zz9)F(jRKZ3l*L}v$XTnmA~qxOL-z&W2$7uJ8yrm!qP$C#ZJY=+(6nxIl&2>(3O4lh
zjkSt!!XR;(D#a-_UvE=93Lu?kc7tk%Bys5aNSVd)-aI2ex@=+O$ThLm!MJ}Ur{|ND
z2U}z3TpZy>ro?llZSInXOf2u7D=Az`kF1WZKcloXG#uUrvp)SNS`Sd`UPM1Tpcs0e
zqVhM27DQK)k~V}Q%q(sC)o*Mf1>ixI_UrP!!Y3uyWyA0LHLWM#$GTU;lN#&}YD#-B
z4BQt*R?7qhI4*Y(i60ycrwJ8`)Wwg`(D)v=a?mMmq&|DAcAb7~7uuxMsb4n#y!NTd
zEpokcznO%eqxf1Qby!8#@Kib)GWERKV>sb9trG289_2-Ejr&47M|*22y~e?31XIT<
zlpnX$y}Q|ug7}MhfJQgq!!3up1^Hc=L7X$;fnqE9v8WU9_A`&H@iSTqDeM=IL~Iul
zTUz>Ba#S+_oP9i)t7ZUqBFrY51d-B*RuoB4-*G0a($V=lEovAv@st-nn^w6fg6QZf
zTARCFF0=8~)L)f|5`FmWPs05apYQyg&;DI3c%AHkFmV*SPOd108%=)IE_|<bx$y}Z
zi+<8Zi@xw{s(vS2!b^0o$;!|pgh|Pmq-0@5RB)$G0UYi)Emm%GfWqlbI*!*n3@<!;
zOMV0HiX{H(RxJfxQE?j|08+f~ZvV==%5bQ~0VUPNCd0knPwCB$DL|;-+D2lk+Mav7
zCC&DAMws}+{c))?()gvW--TmY;xF5*w&u*vD876Cv<)|0iTPL@K!1qXH53Lb4$NK^
z?TT>H6Ol86`2e{IUlm?UYeT1l`|fR?wnMOxDA-du>zkbHE1NhnQekZyZO%&}_cXGV
zq95F-_gK*f)EUsvNP4lk<>m157u`oJcbGZ62vNgx*86En)^|VG3>}E=Ln9VP3}H*=
z-!>gkO~RcKmoM8?JBp<PwD`qzW7K9UGjpYi9hPzUMjLz_|J|>$<A$i>fLPWo&xgl)
z2Q~jcL>;(7IvI#+vHkQrCab5}N1>x<S{F$F(4^vB9;-B-OL<p_IGnIgjJHQ%7<w*>
z+*eve73HZG6^?%xwXg1F7~<J88qQ{(rycZ=+quHRbUCHlnAeZ@$rw7y1OZLtA`j#F
zDEWF2ZXZnLw}Dz?7_E>AI}1!Zf2vIk#KymAsg1{TBD5*x^#`kpK1J>;iuhEsOkMMy
z`N(rlWse@|5OKKf)0SYg4xcRa^91v_+s_Hq8Z0?=FS08ow{oK09j3BhrcHL!0^>nV
zEjlTQCjDMS<e5_^y);OTB5C#}@?|N4IjKb+tfjxPXKkt({5&S@D*RX1WTsntc4jGo
za}V9tR$<(r5BT=Oj`rAs%4lPW86rKW2cHT@M%rp>ukv7S+_*vX+pg^W;?JB7ntLHH
zUyf8erbwSPFffQK&OE#5hZytJ^-kb2@R<5=<_RM+^FIX}R=SdV3NK-6gs;I3HmFoY
zGq#+rMl1q1_~c&jH)|=^jA3mW$DF!cn1>XTEDURMPU{+fcE{e+M<9Q)V6#gDwaT4*
zo0^iHGPJ*9{SYB>d<VKG`$S*&K%43m#zd}{3^!J7k%T8*!!&BiIEQRX>9NQOW$aqq
zP1IICPZJ>dNIJxFu;XPX83hm`_RsDrFEWgL=%k-72|#*_Z?oUM)L<a|5Lfpalo^Z(
zm%KvfNcH3UYt`M9=@Q-tM<W?WZl0B56;Cn)t5~vPzQy;xF}ZR))0wU9ZC$t%gG^&Z
z4*(i+q>LB=Ci4&~5*(*gXM$sfnCI5v9>rCj^t;MQlD_9w|A0FBS|Z?16CLgCxS@ip
zJz1l|977P7S>gd4%|GNdP!%+(mP7BI6@tkvnceL+beLVwa+=LQz?Wz*-w6mu4UA3|
z#2+gy44k1^dq(2Tx`_*o*hvvWH!+N&!h4U+HYDBp-uk6!pw?tD2)NJi0*}tofJEtN
z>1q*ndh)Pl_REZ)Q8eDK-bn`%1%ic&Mzsnzq9gUHP{>p%iFfqieb7Ea=#%eE4}DtG
zX;A7-<tv?k8X0hV@#s)_?DGQJpFuRx&9j#*1Mcv}>bdLLzuwD=xbZuh2Cu5HaKkKR
z6(hQG_?n%wGhw>*vYq*#B0xn2&!f-7ayx9-uU}VFQ|l1n0-#tmUHmT5>gp;6vv$0v
z0jO+$t{U?qvi7EIrlz_1+lUB~jzCg%4UMA0Le16TFz*ilal}>Uqm|iz5U9YQCPSm2
zy^Y!2T^i;bCn?O^n10yt?{gb96>?ifbt_+UUqu?Ps!Td#M5;aKt+(u$v1v_yJ6+eW
zcl#{;-bn2#yd97Rll60HFLhcGg*&(%E~W)uwGdBTF5>DUSMLvYvqfh@vt(14f;>p1
z`?ZSk3b2ppacMFl``M#FyxleSm5FJ8(zT{*jNa@w={_3Yy3$Tv-kdhZ`McCeTTAQh
zd17P%={MRZjg5GDTfqF|A&}u5dQ!`P%9V2X<wW_D+n&g9li?Dn$7>iT|Kt7HDy>HR
z{d@PiUh^@tu@On7O8FlFWs61u8CD)a&Br7vN~^jQ5f(-#4tXps{VXbqY(GqKim3re
z3`6-tF)WH#ZoMma!6Iin6SJO~BI5hB-Rs%G0QwJ7wL3_S-yaRCYn4{>Qwzz8u)6D#
zSNpWrtYRZHl=C+*y2Hg{2|Xj5FlYTh?=#_^hu>YbkV|_-Me#nKOq#+1C4bzc5*yPE
z`)Dql645b3NtKdy>>{aOs0+i}od|~;rX!SU-R9NI$Hd%bhuz&*$NNjlbhs?i1*l6z
z-$OVe^TI_qI7*aLTt-Lt@c7_PBZbh+`a{D8uM(Ny_72eqB4If>XHrt9;^M*tWEj2B
zwQJY-`TLujPlha1+76V?0{4r9qZ5UK1MLrT&CT4L>6W>(Gb05>pu&sb;I)bRt2odr
zMf~`J$rvLeu3E`(7p)7+8@(UB4Lq&CtL|FCqF`c4o<~~Y5+{}7{YwjX_5yq)*!L<N
zsKRz1hxdP0K6A@BVTn)?UQTb=**~iqA0N*y2-*|jfAVFFy@GA61Eq`!7N#=Y`UO@2
z^nL;BF5M<SLVGNmjEs!Sg>~Ic5izk8LCl*sZ}8Z*thn7>i1<6`Xt|Z^e9uIM4cQZ;
za{Yb?U?Ro{NM^AtNg5yEl!B+gp@aYN#{SGPMgUm91;$9eF^0BXq~hXgx`g6~7H>N}
z1+u-nvZqcRI!$6!l!=Ld4@(|4i~_GugI_?PeYDmR%rAI&cmN@FVXZanYCBEsZB;k7
z%KG|)!@Uh4rzw&e-RH3`F>d%IRQ4Xy4B;UC+a~+wAMk%ODF|eLN*KyYN;Hy(21f3Q
znTdzjN+Lt+?vl(;zjv-JkXf=s+!il*75w|x@d}$>7R7{{-khR;u!g6n-En2Dwc(*I
zQI`R?f9CHG2i|GYQ~Wdh(f3lDjc1wwNrl+h*r?PBtInX?Wgd<?j)OX6dm&-o7+9yE
z8?W#FB|5rJ1meTi64{&Jt2|D9rQjpjV~t8pPnPRF@p+=HosgBq%#fgz)>iOJc`EZm
z*m>lE%P%e+5Wa&PNR$&Uv8<p0$`Di>S>)Z@*1fOojg2R7hew8I1BqazYGSnPI(5l5
z5Ljt>v+>hgv6SQ-&Yn!;)8n=FZ{Iq-awP08ady{r_KiXCw3-X1^F3+9!^8Hd;t3#J
zrVke&p2Rma5UL%~&jJJ6)j{{Tc;SMAipqW5$P+DE9{2RppdRXPr-l9~WdNRa?$~Lz
znFDTcAYwf&lpTICwzZ4rX5H}@JYI@w+jg>i{BY%LRQ=8u<AdR<KD2o<N>Q$O95RvV
z*tcgyr#x{P8xXN`P?AluGyv1gb#-y032p5^ij`bCkGy!(Y>WV@%6Yc#y0s`lqNtT;
zyOdRO50P6gZEk2dK3?nEuGX<MT+|Y65QjlPU0+Ouyjs9C`qD{>LDy$SMsyABaQ!fi
z92KiLE+1KLT=;GGMNM5@YQ4}4)XuOf*(iUaDlql>;}Yic$EItDtd@7Io>w|m;BaSQ
z$V6rN%Vg<zSM1!;<E5U{?15Hbj^skh^_3Csuf2>-Lhf?mAUmI()s)Wf^&pDW@g7r}
z4E_l={z22><a(*_k-=3ZRQ5S)=kClP>Jno|+++Wk_;}f36PWMWca1Zl+WT7e<o__v
zz8%qnoWgg`AhQ}{7`0{&)HxX@;ONLTYrFWX;cz6F&u#qfe<5)H*n%hD-v;H5v1yxU
zzj)Al<OJr-SX&Fsci`;U3=#0}{sr#?bMTE<tchXAY-u;d@bHqameuT4y=43uL((KR
zc<x|$+mwbn@JN0a4r{f1dzy&jA^}-Fe<fb`0>u2h+AS)yK{DNp+l}YpE#{4#9QP>S
zGT8MHe@pPa#A_=8Gm-vhGcjKz!{N5zJEMX0qsHA#T?L=&*^!0FZnQ|D;v5a!YtqJd
z48Feop6tA|uzhFQT}L{!-fKs3SNpAZjhZyW<eO=??OM|rTfA8j-J(0!{Psq89Th0g
z%ToEI*V(|0U;3tKe6^Y+1=rYZ`pN*J=E?S_|9t+_|Mf-*Qr^y|b+qEwjv1j>X7IlY
z%5a^@`>enoP3%IuLK+<Hl&x*59H}9E4wbss&Gq=}c7|9JdF;Mtj@uurmMyPY;$nwh
z1_*%Jp=*7=S&4?cVOPH%eEQ|J-X<PSBgY-+m(=c{mT4bM2^2qOlUB8@@9nH((=cB@
zJoX14uQ6o+{sM9M)o?Rf?n;6WnohMOG>LYG@yp8W+@iEF`tcOg#aF^`qea|`<Jz)H
z2L{gU0xcabfi+|m)CZq^7UkgYiXQWJYoy?I_c*f)QOnOZk*jx;G~O(ll1_o=h(*zL
zKB^m0IhJ*>vp*i<@SjNX$p}s{a5l+kg3p8T4+ZD7<^DY{jnJ|xsZ*S;xWsj-!{S<@
z>t3r|DCM~j&9Bnl!A(26aYjT=WGNp0gQ>{j>T6Bs2XE<Iy}BQo7R(*Y4-nBM<dV3l
zcExNRuOuHCLmh~_2AG>7Zk@tXQd9F#x1{FUKY&ba`KO3g<QJRe_Z+v%t+7k#rTBjR
zbj(`x>EP>zh}8Z#s&imy=mk>FU;p@hy;a+>Tdu<g2w4nrKYLgt<Ir~BBs5!<*zhs7
z@+19$wQ8DMJ_cg(xbI4uKh%AtRWXf6@=32d#c4~S)kE*mBh{z_sJGp|pm93Gjs7JQ
zKN~H#+cfJX#!wL^w?+Ke{tKzsD%tujbv;(&HI7l-1AD$wQ%NcYx-Qes@mfa|2yM<9
z_9t_d7B)*6qX&{9b;GEGk0SFX@{Pwco<U6s^v4+)lgf#6^RuH~c>~?8v%w<WH%laS
zHj1Wnju{#@t2<M?TOesf)FuD+P7uB`on0TIOjPvm^t4)aD>@_gW@HXJkKGTKJ*5oD
z@w=UCmTbkZukcQVNHM-%5V<bj^u-2^+0Do}KG>8GXzC^y=y>j|G1ksEPp<7*jenS{
zj$B#uy_aOb<6gL*A#pr;`gq&f)%u8Oi3fjio-+&~uWw|lwW8sD!IO1zUdl{7WR(PU
zR-$Y<ct-+$00Unn_qLkP@}OcAcY%<g#GyHcV`=TUIWEMl+N4#MCt0GQASdx`hB3qH
zvZL;VL<0}vpn0+>*UtvKD0lodu?8xF@jg&~k&!my@ulTi4#UJYM$TwLB8oFf)dq9;
z%Fp=I6}x%I>l#PB6QBKW?Qa&;Z6rlabdH_Ix-;CzKm_<bfDESnu(Z3_gOuQl=oPPU
z-CJ)B3E^-kzKEsRcn>)PrX{WbdFL{9$$QZGspmxMFJ_<FV{5~>6qm8E9~Akr7i4ru
z-1Z(566_Q6nycrMS~;GtGn=pXoZzp3m%zVN<S)B-vcCvVM2PrJjZ@N;top9!pvin@
zrsnpSys3OW@>fUUp~eKOXY~b3Ej+7wSH0ybE%|uy2LzU)MA74=MwaA&triaFCMZ&9
zEY4`pS#ocn&BV_W=O}^^@D~nQqGR-J+>lybT3)R^hQ>@;9Y{$W9nG_`?eB?in1=VX
z9ki6F9M58Tyu1#;3cIsB_NI;hxZD}Lh+lPD^WCOAe;Db`;kT`h8LMZSFf`)7D2X8y
zJRjcf5kjvYt>shK6&E=lw+wWz=y=LiY`=%31z#36=I@ma8y?e|0W<!yIoWfD_dd3`
zJv2VD`xLLWBRu#59|AQ+oC26AGytD}E?mdd6k%G}k4Eo&RfLA{UG%%{z2C@eGUxAg
zNES75@a1@UZ|ymw|7J-^udtxR#+$~*TA`^I68=j1_~tT=4DAf`(Mb75caao&d(MBN
z$uiY?JZz7}A9Lw=VOxZ;mczWAOJL`Uj-UG0R#?=yhW#T@wGtbgWJ`xt^|2KPM|Y3;
z4!c8JZ;VJ0d{<XZSbg+04lq(pUii=643ML$9G%wjCxjU;XG&;I-O+qGW6Tq&s+?Er
zef|<+^>s#uhTll`SBzYn{Sr279^=xo^=>q0k<LGQEAd&*aDSiRY%fvq`Aw(~?gOQP
zC&89LfHL|yIXE6W9yX=zW=}~SR~~n>F|0p!HAt!QoPBo3WrMq^{v0S^;iY&aQqov=
zx6DYkSo58s-(0VWk+C1A)NyN;T!DtVHrg<89UszcI#TOuqp7J0Q0a7Rxc<-YRvk66
zzQA}CQ?-i0Q5>+{BC_|zUhVPya_sMGve+UtDV*K7kJ+L)Em~J!(Bycy<m_kt$kE`q
z%`<!PFwe9#udeBMXH198U)u)ps@fbMO3dFyR7|o-*vi-rHlmBWQvW5E!Y18+KCnUc
zNK+ddO<{9<5YhBPso%I}5Fdg6a8xODyc4lgGPIA8a31zQ#IGPsq-JJu)!Ss=I^!;#
zO>$jcguqa;Cea2$nK$eJmhN1eyRqd_TOPdF8rgHa(P4rXtz#wHXV`zcKN{@+B;xq_
z<lK?OniQkb{-p2DeT-Z0;pmMGe=f1@j{O6Yt78m)d^==Kx{?;^*EmSO?dK6lMa87o
zYOT@GP!+Q^qP@M>+G+*(9xdG>!N^uZe}ce+SvC<75jHltgzML?J?$?_&3yOH^7C6+
zAY&@a%a_5MtKaL(8W#=VgdIjo-EjR=xvlF2&@4}fU%d%Zi%h)60Ri}DWeRdSJGICJ
zT<j_NYq<9|h~UKFgF7+;u)c>S!+nO6<6e$$OmtLx1=FNpud-rp5ZMz%>+4H0sCSJ9
z&UlLGNR{AiXp=|e8zr5IUw@fhlQ@$0mXlFL>&a*gw0bD4(NT-_WZc=J^=IrD7#Wki
z^B!KG<Ej7ZSWiY^*3;8{uedh;V?DfNZOIEObW#!nnhku@q(}Zpa=7H}mioMo`@Fj(
zfmZyp|EaWLNExB-&C`Lc`~6?QZg-=U03iEn|AfrY3&RZpN0=$1r{z-#*2)YM&4(Eu
z4=m7z;kG=lTydyqFa|Se`ucW)v|C;hof;1xn$8lp*dbDc9Tikn$w){5U9mPZyAGR~
zpO1RC`Qaah^5jFLP0pe|T5KdY`lu_}D`L2<^K-<v(_uDvgCgMN>mR3EmlodS+hwTC
zmw-dkyCYc|1nDcOiG#3hQ3H1$KYsk+(`<&61{M~lS7D8F6E%LNZV9uqvpXaZ*vaSz
zhlcj{_klr&8Yc;dF(oA>okr=>FfcGMGCE7l6>@{`1c*rMdx+L=NDf{wV~+$wm-A+Y
zv?a}bJ_jpO1;xc&+<tdNMaPT197T-RyJluAvp#>4%!a`nF$g^ib3~luP`*}cYpa7E
zkL_YdJV!x60pO}Q9*)+~g{^()#vCjsr&Hw-;qpWFB|aN>utva1AOBe+Y6B4e$8Ea?
zTak^|PQ16OQ_-sSU#AL3(NGtFdDZv9E-p=i$~vDRNm)S2c;&e0R=2#o4Cp#vXFRw=
z({mRuhJ}Y)s_ScL$jDaeuS7G;<!VYxOB)+AK%E~vI9H#9$K$&o>~yu2TV~|0xG4i(
zhpD6YkQaa({ij!fL0yT7Ra7rNm2!6B;;9pc*$3lI+`ejpJwpx0r^7v!1hgaIplm-`
zujr5K@8^j?Qc*^T00PXxp;#A`Af9j}EG+y48xb0+UtvuG)6~)`WACN`0i(P;@9pL5
zsM(xZBpod+2qnlX3UypdY$R->VQh>ve=kgf{Fs2=&E6Bo<R_>l!UqgTw`q`K)iec2
zu{bYBLF0t}zP|U4vGZm~I%?`ig0whiHMPqo(Vgp8u3jY~BFcfmW@l&dd7b5Z?ZXj?
zZi)PHiHT8PtF>sUOCJ7!_Q2HY30}JZe57q^2Sib^$-zO~($z?qhUa2$$NJavy<qel
zhr?wvC=ywI*j(A)TG|uy24B6nUR+x|=ASM8<l_sb2#e%s5o4>{hJH$f*anuDVr+Vk
z9zML3lE7)y#?i@oaI-$C8Vm<1EBp8$vfOmAN@Tvoz}x*#s|&J3TutFw_P>n0`1Hi4
zo9atwmwUgP9K3~?7n6A!TV;s4hV0ya^EMDr--Eq9$H_(s9-gRfKH-U4g@U2EnHiAx
z*-)nwaS9I&%`Yl~xmnj*N9YLKCQ*veM%-wCPk7KNB3A2`?yLKz&1XiP1mg{Vf%>k8
z;W`6DpY;fN*(GrQ+G{6F1qV<|lD<8+o&A5jz803EI<K6wth}6GNGQHny{D_oa<1Jt
zmcv`@a*~~%9?gNr%2*Y6`Gtc($0nzs5IRj~g6B#TBA}HSUxtNYVUP$nYrR0c2h-Lf
zt$m<67<eZR*O3KZKQR5oiN~COJ!m35mLjm()E5r4g9j#ik=h+9OP9X(eFs&6E{&n!
z<)4)}dE$$%DMCh$5=$Rz>SKPMy5u{Q=k3w?2D;v`ojCFP#c<Ds_YkfCd<e_G{54TU
zG8gIx)X{v~<fps2ie#H>mbnUM&L0vZ_r4Q_&EHNuwFtnIrbUZj!o!V%N#iL&?JGzN
z1&w0IRHes+B*;3g$G<&hFG{j5oYbM<2UPga2e5e_0e|<-{PWu?56W8gX!1RI6+dAP
zd*??BgGmB8pW8lsh#Kf;7)9I}t^Q6~!gav;0DtgN&lh~dM|~70PDGvo2_(?QEG!t=
zRw@fV#fL5@Wc*o`Sy(0#lWL=3gM+CXJ3BN1)SyoM5)%CH!Job{gt%5@rJU%(dLL(B
zE_Qhi^4AAN^XX_UBO|d>NHN#BhRvNX%~FRe&T`2q2DshEgVhV?lgwR>2-w*1@$uBu
z-}Qo?K0ToTe-acE6C)c&<v39%#KO{A+*uhN9v&_eOd72d928V+(&YENffmUDrmx>n
zhi`l@x(nw|oVyV2Imka3q2tQZB{q&U&~W7zbkn={(V^VFXxMqK%N7PRQQxm$YrI5H
z?_^-4L*r~~oA>$A;L;YS>fO5m@dCsJ030EysHi}hTyPT|&R<+yeDx~&VD!ab<reGn
zoyvNhI`Qljmc9xs5gp5n!>?yr#5~@yuom}i_6j<I%9QcBxjohyt&IZfel96HkSehG
z@!_tE=s8l-xiOIEj^no<=6s?le^K$5J*E;0x?bIEf&n=>x!~z$%;4bQ^z?Kz@y1_W
z?}vqOcM2)C?g!(xu%<6Lo$+Zz*mzV@i-x9NUBl!Sx8JDOYm^7JySrOiMMao4V%b7p
ze+=YEgjkV^quNFKtqqms<zVJBerRYYGc$8DG2UEu(~E$BZuAcfSg}?926YB4z#OqS
zBs6>$_Vc^0Jc+H!WICA1DPO+_*$o7s=7GFkH^q>U@m%iZ%a_lbIU}0JEhgp-AW_#7
z7}B1X`oo70Ow7!q_ZE_qX>Yw9vWxotjx_9DM~9QOkB_*3amRgbN88u;hE~w<sN_nU
zUIwZ)dN6bG9oGN2&cbQ1j>nI9Xi9VjeR+*r!?mzO#LI?4u5*I8F#o(o&|c{1$VRn6
z*-J@HO$8!xMn;nxPCY@ILD3&1H*4|tWo3)>DuUGW^YcgItcA&v>~D#2iC{q7EtXAH
zKVR*_kgVkI)YPxz;w&Xd3q}8G@206G-w@ed)f0a_R3cp9@Rp-a8vOuq%>CrFcjxh!
zmYN!q#Vt>Vkyl^lMa&MiMb_6zJKi>GFVTL4=qbp^oHj}8+~CEs&|9hNYifSfnJE_s
z4Zpd$+1Az;3WfSC@=qu=829dcEO$aA2Rz1O8ob*xh|&)K3om}WzA1u~2njY-0nIB6
z30*n>7%=SnKbj)wP{jH5v|<rDbwS$Jz_PvkpJpW1m;T4T_P*0~?+BDb-sKa&+C8YL
z+ZFg*7B|-|cm+j7;A88on1XM2FXA|MPq=fQjgC0z&6{h&TzYkpItj|#Y90baLYU;_
z<o<qj;{>e>Sgn%cGU1yq022D?R_%F66B#v*ExjLpZeE?JCmbGDJ5Q!3CblJPC9fL7
zhU{FQJb9ay1<i|~69HdmXn0s%bVI8G$zjCSb&Dl0+d(mbGc(G$anBkGt>x_fI>~4Q
zU9dRu_tN^?8RXJqQ^fZ4<2|#KTb<+_92^=N8hf0xU0q64zOc=&UmuU6Lms$wc65w@
zz+W+oh7}CB;W4jY=hBRXhdSd5aTzhb*SNVO6Q>{U2a(X_7Zg~v$`Q~1`=$6hX4_18
zzY+=@djDAL^sEFYpKoe8Iu>nBw|Mw}?fVXqe%!ACcrPwQ27RUf?%bOsBLqRhIl_e`
zS-7XApd_aNp4W)o*zu<AA%&gfHTDiT3}^Vc{oHkVf@KLzy97@s^)jI2X-pJ)WWB~x
z&<5R;r)c*^UnuuRCsV@B!fDoC|AwVd{E~U`J5v0ibWP77mo5uW3Wx`h)-9<J(_+)U
z#@@Kf$VI<d1>dA@awkdcT1F+5Yd^(D`teGZ{q~pE^}hyr?PiN5qWJEB`26_nbOHVR
zlq{on%2K_GmYcQ_#w|xmmp0{g&yW$V-j+V3$>1xATi@2!XGPdrjdIFLm1(W@dp7Ts
z#R;3P>kZ2p4;*fL<~cM8W_v-i@s!6m=vPEN-(oAIZM{c^)ZFRi$h>>>7pA510Gu4p
zKKx^NnW3e&TIP;*$;)~d6B}>zq=))x$VJBxG6V3)JghaMl#qp=3I=g7KITGXhQG-9
zBF=^lUHP`e;8DIE^tS4@dxpdubRdnPB1?#TeiNa!#^-7}K04Y{nKYo`4-8BD<&kA2
z3q6M7>>Dk#WIV&X<hH{n*p;fgjpKG=go^8q5^FrtIp;--I+O$PV;5&^HlGG_|FbWJ
zE`n2L;Jy+@X(m_Pwc~5gDsSjN<_YVoZj8U)c|R1br`18MN6R@Ilt)vl!9N1rySeut
zTMxk@zMVF-0(!84NR!f3N=xpd=uZ`M9nWVPi+hTb{p-%E%$cEkELsVaX<}x!vc<ij
zSF$@0c2!Qa0F^bNA8*Kz5p*7WitFl%5Y~Km&ySqEUs@)|36tuelUH_2%I+kV@*mjb
zH(7TqJ!5SbVLwGkr%f7q51uuBEd$*}p?4+GuyUo3n>$w0Q>upv-p@4dtj9!#QLtf7
z;qKmQ+_;XC9}r@_=p^6TaJ6K3gqKT?mc?nJNjc%AUUzurr{2T8g3ivU+4-g#H=H!x
zt#SiDetr*FcH=wLq}n$*EQrL&kN9j0pjcUN--1FwnPE8yl+|C{U%b=WAZnnST>jB$
zmt|h#I&X=!X&ZK%Z2N)9MvD$c!77HxZxs5jN9nlUFtH%TL53Z3!xob8MI#TY(&#P_
ztb*R}xj6n7ELa|f!>qy@LA8^-808fHfdLbFMemse1MB<uF;~d>tx7|eV;KW8?{GTQ
zejee!_pOh1x3@=IojZ5#jT?u}O?GxY5LyKD5E|<0%MAh^ckKv$GGs5FKex2DCIu%q
zO)^rK@P4BWKW?0Sx)OKQSlSY|+7iKQh0z~vtqkN{JRLd4WWBnil3#dp+LOEL*q-I?
z@(JKe)FrLZOV^-fnai}`DDbBo=d%&`i!?`vSz9}Wzl7{gc$&P7BxNjQs*muT@9_*`
zz|Q_eqk%R70{%QaJiWcWCUtJOQf;@b|Edf>)CGPeq}akelhH_x@<SiHN+njF`la}-
z_v4VF%DscyHr8m}Bq{r|N-6T3IXOpy?4r(5gAm^YLh_m7y=POI<9r;dmLI91*rG!5
zG6-H&G&g^&?OkQfuGLUPVxqdewyT4KgPB<-?oF#H@9$E6sieSplab~=-cF+BfpsMd
zDdr5|pT#k8XlZC**Va^_3vY^xdr&A4GOyJhtF;}--rU@*tgJL}s%7ElkIAs9`40*H
z;St`};rYqpial9aKZ<9&xGya|vN>r`T)*rnqr?xJGLiJwvwQ7x`WD0P?C86J_y#J>
z+Vs_XL@U82Iroe^Hj8RO;^i*S-hmE<Q7!lCdl|!z1uo`TA?jp+ka>8dkerEJRg@@*
ztE(&ULL=@oA3RvEg}NSR#nJFs8C2P6xw!Bd3>)a_)dExnuw!4}!~IT$GA6GEKj7HS
zl$BE?f9R3l6A%z!W@5tl?o3=Z5Fq~LBA=xi10}O3>zI0eW&X}QcT%I@adt)k?xRu+
zZ1EGksQ-bmoks>=;`v)-?F#fYplckA>5DrZMJwIg8@I?)>*P*K{)`%BJ<twyYv&t+
z;c(NLrUL_GVFR?z($f0InC2n_k$3Ml*BDtV@%(ZuXwI8A`*3cfqocrKv`F9(G7fCM
z1b{FNO*^YJYH;x3<sxnGnNOE*#lL#>Dlt(GLq|^j;Rn>_@%mm50)jVJ_D@CF0jm|z
zcEcRG6F*Ta&^dNInDj&^Lz6gen$+9Z#R`}}3tZe|(-f+wQOoL>#`I!EtAkM`3q`cH
z?OCQ??xR@9#=E<=8jy!51&g?;Q~)CdiOx(|C3ENES=rcZtgPfweKBqQlSt%!Cj^MI
z*bf(GUQyQ&hTUOi4*bo%H(g?;?yzzk0^}5sak0%qa=<_TVG^I1D9~xe_8gzDiPEKu
z`|$HNwf@7ajYoHtoENt)*|owov1K#z#Bd;~oi^k~#rx$QwDwTOYT;g&>#+13%q_iG
zBcp;V``s(v`m@Rwigxlb-YJdD6Y&lFC^z&i?Gl~M(^#KVqxJzX1Iwfrv}=;HXQ_Nv
z9gEjQwfH_R6fgG}crh^}Kea(plDWS|fuwX%aWUoKkFb%1m{^cDViTU5`?BPd74*Tw
zz`($P{{Hp#_1!_Mx+JfL^tBMHc8H$tYt^J6e7^p#o~p$RA@gyW&bN(M#U8=`Jbj3h
z<X;qv+PpDy#M!{>H*bEb?B+F&|1atOACLXb>IyB7$*kB%iC3~Gk)5Vs$^*YYUMoVZ
z{W({_xLS)pBcsbOPFXRb7#nXD>l-(rL}!5j8pa{BmfSE2Rx9sUSn+-)ejR>v8^OYS
zIvfSg;*iQdYX3>O;93&;k8<%UZvCIag=e4Vin(@0zI#N~UG{Gp*$?G`e=Ho;iuRWt
z_llr(CR_%;@!0@Gmytz1J##B7mQmIWSZY#II}<z(NPFBsn+S3GcWol%G!f~?)>gcF
z!na3PGb7A4<YRt`O06nsN+?cYs}j|?rOpNhteIGOpH(+mN)}zPr9brxaK&pJ3{;z|
zpL!Qgk{SJjvtPPGUD85hffQqH-u+1a3#fCuujHwoDI)7qxGHDB&rsxoIfB$CycNRN
zJe2qTCvD<C0Q%G(jl?g2<0-8B?H&h2Z7q@OlOsD`!6mmNI^i3fMs<A&>K!w|cBAz>
z1mA;DU3bNVF8|HXR&J&SM${RviT;Ia6ycr5;*!#j6#i5!D5?CE*!T8$#+Vu|%mj0a
z2yf%p*DVU`V%d7Nbx`;3b#dPuu{QE*%RkIL^OKhG{k^#8V_D9SoKK2Ls#O~}79OLR
znj%BCwav_BqaQ|3tj9)|aR_a}Dl6rLomCRDZ?H_X(emmVb+oaa4KxEq3lxvfpFfAH
z@&j$}PkHr+rSt9C!LDI#W!~HuyKtd;G-d!3S~<p|Uo;80z9NN94+36R1}nqAB5~h}
zQhfq?#FHnMG^b9Tx;fE7qoANb#id^fG}pMeD^{M!3)U(3+`H4nbhhd^$A5(rRGhv8
zY$wA%h#DNZ=rbtCbURwnlN|deuO7?0jYA9HVLMx7Y(e-)p!ALBP=4k2dR|!ljgJSP
zd8OP^`==dIlne1)9j7~6uyhN{#l<B=8v!cCEc?xyrRoQzw9fbM-?y{N=PY~s7EsCi
zMi!0_F92QfN3MOl7ViZxecSR;ZT5sa+nAKkKmrqk*@lEQ5AFO^t7J9h%A)OUu{dFH
zm`Tta+WA<|!a`ANsPQg5rhkM-iiL$TjH*az$X2+lwq|#BVc~1@Vx652IUSvWuI?kz
zM|{-8ZBdLlfnf3#bQX@P&WJ>H>+WRO0PY4CXLNjiULJmDCr8}vp^nb{(vrQlc@m!L
zDj3k~5d6gwU=uMed9hP}W~XgO^cJayc05V)sL&%?PouYhTx}JFU$1U%k4Ri&x55HV
z7Y4ZH$4*lNn>;J9qQUEGzn>NDtE8xiMnEf&A0h1Me650-ENyM=V#C_<x;jxo!MLMx
z^}?6NB^e^l#$H}^%g@zc<LBn*fu`!a`B{#+NuT=XmEOTiQzAFwvqLil8bDiZT`bkM
z@Q|m^kz~}1et7y6UvdF+0fk@8%ybqk@J~J>EcqnqU`xdQtl=P#akG<s(^e)XHBD~$
z%IRPAodsavstK3tXUyDwu?D5OZ@jh~mrDkpO==<|y#Ok{A6h=%F>dV_wm)XDKumD2
z5;peXv46SKkra`*1mbD9^XK`56V$IsY2IKm_s|>Vww(shJkM@vzU-Snpick)tJ(l<
z3iO^&-ZIkCBi-RqQ8qwR`UF!Bmd~z8hSE7S-H|!@i<j}3x@5k{D=O)7`Mq@v!0eB4
zR(_op?=YT{_w);*Z|8jHRP2N^VrZC|R`-yi%|v!LgVeFpx}BbTzqnz#{KF*8fs+KQ
zHucvIDGfKXyOnbC^149O_oCyOoNro$D<0oersF9TIunejEB$I=dpq%rAJGK;*nKPz
z!FoD6I=*9IcE#&au%?KJh;azu8^eWqGVEGfT3rv4N>)cJf(;Wh66l;CJz4;XL9U@s
zw;z9b55$0U=v)%x@ow9u%NnM^a!g)c9{51jld0f`z<$M3W={?^ZYON3Vpv;OOm@9G
z&12O2Tf>NsZ+>HZ$1d+*!A{{SmQhM>?V*4<Z&wM&Sn_jZ3&2;`CO?BFH7O<>{UTbd
z>E>eLEmxA{xvDQ17h}KvJ)~i}(CN!xFk#9K%3Bg#^Ap7+^$s6SGoUA_J1J6dU6V^*
zT^0`LTLyE|WTCFKJUA!dl;5|wcxT=bPI$^<>^Wg+FsKu7sHvT<i-6G6gDN<`wl*+6
zo*ZA5C`m=RQd;g9kF>^Gv6GOHh}U!g#p)l>BQz}R*VscjNy=aHjsUc3MZ^aDmft^B
z_q%EnkXjN2xZAWq0*tNc_qhi04=a4B_F{1@jZE=C&@dKwQ}1|Yw8634nyx(%-aq;&
zThY~b)h|_9I^q1SC|NXJ{;aScpTC%Mih1e)Wb~-L{Nohp(gl1%?gnO4ZHVF`14el9
zH~A~^g^d+-_xb$m*RMA>-87<Lr}@qHJ-~>S4`s1Awn`%B*01kx4?5CihP=&VB8=Dd
zdcrU_^x#8vd5I-=hgNJ_aF`(F+7hnOFfsUQXA2=gNM8HJeg635`56Nd*LMs(kHJ@J
zsPXSKd9pR^X-vJ({8$A`e?cZLBPQkohk=$}uKgwKM5QgoC}&27126TnXU|3|Epej8
zdcvb}qurIQz)i03Kg2RBdP1@luvlzS6O-<c&2^-V&MzVAB5jGLd67;|tygWyGQE_z
z3I|$YvAo_`O?gS>lX!wi`=C)R@5VzePg4~XO3sST%7sH%B9s45d5mvMwVH^Mei_q%
z?(KCx*j@nwq=Q}wu18D1NSKX{O;j{FajeG00we$&9l2lvokI4Ae)kXIz_*MsAfLkx
z<P5agWcUt05{#Vb3{CxyEA&5a;ZmF#Kj!ZI`A(12nAv%aESP~jK6Yd4ePQ>oh|-)u
zhVq2gTyOHp^Pc$a*g4$cBE{&EM{Pt(`a>EZe7V#dmMX_vg#hIi5DA}zC)U=iVdLsv
z5J=7P3OJz9%F1dPU1g6-NKC9y2lG_&N=k#o4nD1Lw_)Pthm=>Zwkf(i$jtCsWEW-g
z=1>P4S&^sVM#Z#Qqg2m&UmG}f;%~M=XdSZC=5Tv3!%6IrR`o{himKospX^K>T8gfA
zgS@@Sp@BQ2fzt@uDe`7Iz$8Ig*eJ<C54$*w`rLD>6cOienWw7RzWGT6!I^qG+Dxmu
zsQ*1=#U`$kOfPpU@pMx6cVSTd@fZ-j{r!(Gd$UK+907Fo-Q@oPVZ7ZJ#dK1Q4Ge7!
zMMv_G?R2h;*T18a+j1^NSPG$4Y6*c7aXmMZf}wk$d$r&1K78><|0_=TXmLrD2Lqw)
zwzE|49nZ2We!`88@ym6MlNxmuK<jc@gM}qv6mQ5(T)k?KOvOjaf0OinZk};qCq)xC
zF2=Lp%v*+)6O{P(>Y;kCb@u(3Sqpx*WsjIyxmRA7Yl_Y^;xu=#Mdu$S6h4@;g59;{
zxEV=oa1trD|1XF%7s%Nsxi(X)UU7XKBIc}Lkpu!?XA7;XK&%jJfq&Cv^`h>1f_nyE
zFq9`P2fG_w{(w_{ZTC#4(J59wPnL?6VyTuyl>TV-;uA6F##m*w1jlHr^Nr|)h#-fH
z7G03btFGXujDDs0kzTHF-lT5<UjrCjUcZ35-zm)b@qO(e3j=<BAbwk0E{nS5c9}jR
zB}uFEBb*j)WCgYIA*GmSW6RQhxASmUns=atyP1zU!!NjPRySnNsrMFl9R2L=qQ7!B
zF2H|E>uIXoRLm-a)$OOfdc`iRX$e2<%4n)X98wR-wz*xRUeGZ<%KyLpTM6g70Sk{x
zWRS>yBu(ltP~<xhvk6J(&l4^dflwI`(liYZ%el;qy@4G4i1eaCIxz2s;?oGjb?&%7
z&8gB_?%8kdXekJ}=jP@vEIevuqX8DyPi30M5xMjn+`oeSg9n|RVz+(xjM@K$alput
zQRuKsSz2Vwn3<ac&haPy-+ue;VcpQ;kMNeHdMZ=_$os}dMj+Y(f&}T1>GS(yO72O&
zgt~z4U}umi;9eVjWna8tTpNw;F)tO?6uj}5qgj%MpD753S5^)_5?tK!O2~Qh=6wfE
z5E_!FRiIsTftVP0gO-jLNl3UjIOrnlx+Z=FsQ<i5!4Nket--0&gqWCBG;l@}m}H_e
zIy$})1z_EL%b~C|>FtF?L~wC({w*KE8&#hC|CFh+yR)3zoq@LVsc!Uq9tqG3U*hxC
ze#OK|H#tnHYy^yzo2t{$fHe&cCieF;l*xKgc9P*%K~_bOkfJkM2|6*uPyQVtO5Ga4
zdBecauq$Efzwk8qWBgC(G#6P|SQr|5fqzB7rsrYDIUeeupNY*gKqy_-sY~8Lo=0Tr
zSgp6X!*d@6N0hMkk2oN@)S!hz|8EQLU%?=_E>Qg}tCeyjpzAm2mq*H&`S{dEHA;{3
zn*Yp?{M{T8{e52W8;S!??fQ%Qvofv^`;1y_5McT3jBnUSq}3IajY9Mag`}kXLD-%1
z1;~xWME*TCLi6b#u@OZE#k2N#_i+-c3JPBPxSY{DurY@Nt6OJ(#-|s;fdy8C4+`RD
zZZ#^ml=Su4=x$0(WbEwqcfs|)02?>s@b}wa%?!tRga{OYjDVY%g{Y~yE>F~kS}bO8
z!ZI@hN_u9YukmNko~4brzi;qQqt7z`2cOUUn?cv_n}6Tb{*m74WiKk3+;d1V&A+?!
ze==JYU~#@&2qNj#(n0mRn7)}z;97Eo-YrJ@?V0g>iD$V55xd+LgI8cK;K#D?M1+LM
ztE!4@jRbU(y>mvDnDi!o83V&)MG3vp)?jv}z`l?~!R&8gfd3F^D{A8Is;Wu$WZPxl
zJ-)48aHv3Z_xfJ@cQNQCkaKS3a;7Y2En5SD2yiCAoP3fi=Q@;dV4YQXi;=bbN?HC+
zo}CaE9@iYc-e*GxhLU=$VOHkn;l2mk2!wcbA<Aw3R*`l}Y%DlRqg&B&yoT@At^Dfh
zg#M)2WFcdmsF`hp+XtI;s5LlDgVA^MGOoDt%fEYoRbTx)`i`bj0<Ap&Ci~8qR&JB=
z?2n+wMttXVf%_BO546ee+Ry64@{b)I3gya@ig&=-9<omC65`^9+a64fy9tfpOdm%m
z^bSXrRVP(-cez8|dcbKS;Kf@idMGaq88(Csy&V6g<kQ+7@t4rl{j?r1uud;)xNSvS
zZ<BXwK|QdbETL^SFGA5L{v!%C*F8?|{$twCmtp-)4*e!^i|bG3Ve@&M_@r8WR(=l5
zem!u&*O|o4&Go5t>NrH=!<=`}PSdrV`qk%K)I7Cio(@a{6Tl~Hj11%UyUQ&}aDIbZ
z!rA$9$hgn}{P!z$=L0p?4OtfHx-#s4DkZenAHG9OzbpHw#1|5jl|p<4SOXjx1hnC#
zd;Z&s#r5Y(PEq}z@=+fSCY~L@c^3Eg7hwIK<JsNY`lW^fU)}<0vMMkazOl{>UlvNO
z*Q{=W7bT%&Z8i`>NTJ94R&SzwSGbJkRX#l{US7<ti_8j^u;FsUXAJtTnmB3rU9%U?
zb3kYQUZFDsEx=e<KcBM`(VoqY;QK4FBkuM+Zuei4x%9=x+Hf_>aQQZB7(J@U*kHZ8
z+dXiWGx-rpzUT0$0MnC>W3LFV%G<18V3?s<AMDm%uhcTywMOM(6eV8bFLVQ5+B(|Q
zrm!2MvkG6ySYDF!?%K!}ld#@tN_-*4Ceqc|<B{E+n!Z2a7;IwEMpy@s%+*K{S|c7y
zY%h+`ZFyj*clsUgsIa#hmN~I@@fq%^Pkqi*VW<enzW^iCw2K)uy=zjap}y;iJE_xF
ziS;p-=@Wj>H23OECT{x_Sv4E1(Gjv)g73_D1VrC+$Ijx>KppSl2^B5qFbvFwa@8ct
zKH6J0NsqV!ugH(1DHohsYOstbaJnGID#j8>Kc8hqs$|$T+4BJb1cr~<RR3@ifxzt@
zKDq7g?lE*Y0KQ5}zmoa{w6Lb>tt;ER&lXEx2eyS#CgQttt)345Vy&8<>7Q^mk(xk{
zE-}pD#6;d7M}{y^NQm`%3i|srRy104j6FoFTHQR%p8CV+{f7{g_~VJMLc8@MJ%YXv
zzZ!d*7d{({>>~a<Av;R$ZZh|;3`KbQ%<%6l9oP^(*m0e{JP{ch=;I`yYN=-U<a4!5
zea++jal-AeafPtQO;#b|5RSA6hdM#jK~JzDn=<M(=P+m##t3~2G|-<Rzo!TusgL?)
z4|hEM7i(5K7rOsQN2cw~b&c}Tur`A+)vErou;8$Rd%c)Q<CUo5^SXl3?75@rPJXMI
zs=ZT;?ZqjQPBm`%@xP@s()tsghjeDgn4*WiR3vFRnpFK(a-L62XYa7S>guf&UBuXY
z!U+uvo9pJ})@!^b*eNd$*>nvAD_6#S_Wo2iqpMAJW;t((PkJiFb>pj_sOl=xP9Co&
z9~5uA24~8_okz6>pl(u)+RdEy!wzhQv4wT`C+sYND@!=*kQ<c_*5#TeFz>t3BGvw-
z!M#+&5414~MsYuSpL7~lt8j$8IDgmQ&ChcC8Ew_>)&0eD7?qC?S*3uUJLz}82vW?V
zY5u@3W=n^>J|fiyifM(d##~MYLm~?mKt@w<k=ehHUOAf!9r}dIl=zhgYz|S~GX;|f
zuY4Hdv%_5+;*oY@Jh8AkVQ_we%gEaQc-KiQ)pp65cT1Ow$%$Q{l(Lf;XX#MgTY(dF
z?$_aN7+wppzIH<q-df3YCb27pzEV&(r)d~dE}d0}*WVG$!z&~nbQpha!B&<Ry{oAQ
z+9Q+19Aozfi3FiP>Fycic)*=<n$CA0jX#ss?5dixAvSxTW#04b#_;9vV`gS-D$2Ld
zp52$dkFn$x1f4w#`}Qrp>paF37IzBZK3~K)&FF&G;0WYXP#A6gAxH?PiFtxVimaR*
zhh%&1nSX+Y1k;noOAM`sovke|ETg)i))bjWgmJ5M2P+x&xwrkXrD-tE+5(nOvZ@lU
zgE4~YTvr<(^%Efcdfe#VQNb%+gj6BxX$cMG-u8}|(?pm%A(S>S$jhMw?<cFXi^Dw0
zH_7NLLPtbxBHl8t2{aw&6Eq9*-bX=$+95<~Y@BEpg@g<uHDDqed7wC!0>JrqwgMS@
z4s~PN-{|##^+v`Wi7jv->?YcLZ*^dW#d)N0!Lv`ydNw4Sn%UtSumbYz(9#-YrOL)l
z)IXh^R&;Bo9bTmUt`9=F8>8v>Hz9k`DuGkK@)7U|+1c5*hn-s7Ta#IxMoP^97|Tal
z{Wlc+Ur^UfzDyK!oWMI^(KddfApJ1>NWoLwdX6>aoN|XwqK1X7hhSdS=1`4WP*_`v
z{6pOB<yr$+PCR8Cp@+{^Sj<Q-2qr>ZHo_T_Tglf?0w1xn#JzY3<H`_@dA#1*Qaw&F
ze8N_i7VVKV_sV($TTOAb&Y<o=?Wo+mUSB~SexLI|$RG|~cYMH}MPbe`P*b<%URm*!
z3uUxjeVsL_|0~a*yxubu@cMLRQKM1&DFW!2Zhd~c?h1kj+L@ioM$sRbx~dJ=wFHt>
zwK_0<f>!|jq96yx8>zNV6LT7}>un<?Wr@N0ja$OT`Aa6pqQ2Y9o)@Xc9VEgX)KOYL
zx$z*>4se)0lj3-pg&dfyUv$^2k>WKr`xYSoZ%5<3SRrTY5PL7Ox5-M|Hs3#)V<Xne
zTdwThiV^B{GIgw3h2yI^k+fJ&gR_OT<n}@aVZ@3~?e{ckhpkbcuyD>~J1=Qb^q{*L
zsK$;6w(5_)MW(~YcUl^*Z!g>ht>h<rd0pH=lrPrO_I0AO0W8{Ka9PrVn?jQzCSLS$
z+x*&2RTX6Xka-Sv=-`9C^@F{fz?|0JwYa!c^QmcNX|ZxsnwbumFhh2UkE;C2A3Hg%
zHft61dn`XtR5CQHWumkeV*8#e{Tvj=&nyLwiG1<m#b5DA$g$njcN3tq$u}FR$X;RH
z-H&raT!Vd57#bYhJvh){5X~(p*x1?v`>}W{TfOJy%X3e@Fv~?gP?aqFCyj09%m$X^
zI#M%&CR+(TqM(m^s~s^!0#nlY8q?urxHRjn__Sf|z^YMnVaAvv({x`sK18tozTchY
z#dop`R8Zxe^M*bVZUMzx-XE+LQfLnzrpHk@8Ng_I;1uosd)i%wclXAO2GUaZIt(HT
zQkFH#DVwYl*A;eDs{v`VzP8>ap-mN+@PW=D;d*281MEknqH!i_@;X{FRM`RZCynMf
zvy%Vv)2ETGxfK87i8b;>ZX;a{*VVmwlkSeOdK*<QY3Z&v5rZSvf7uh_)5!@5)zA&=
zh!DJbHK4vg=&G=403sQOPH|yzF)0ZNtf}w{$6M6We0NG>dOEjQCd*uJhKaGUv6>oR
zF5S1>@89jjt`E{B+zi~PoUR0?*PKTDw<_*NShlf_^1OoSeWZ<*EtLt7%_Jh=EtVXV
zWtx{`tXRABdpuKPx;Lxc?to&wQc>dATxcP79>iLH_RDpz4Z2yE-)Cy)<UJr%Yi;F-
z*-LJuf?sh|Q3L2NM^a>hI_dd0gT}yEF=iaSNsr@BH2CKG7ystmw)a~(>Xah<Q;a$G
zNH^5Q8l2bg+DN-vF&XmbK!fH4<QrAFd)Kq|yTeB|_8@`;+4*~XF^~uT$QiJ*u_-Zt
zN9q+1LP*Za=<4cPTHZ)G#}(GWdgI2->}>Q-(DUb|DKy~V-SEWnXhj1WRx5DgV|F&_
zz_@L}4zMi2FEIb05+;~N%kNz44e?Zs=C+${mY>siILqTZ^vXnzUn#~RKdp<-D!}Ay
z%2|hPl0UYkn_e1zmN&_hPa~nv<_Ly`KBzJgMI*Zg3eU&N)`xq}HtP!0jdpyrHCPR)
zi|ja(I=4Eb()yTs?GoR?{Yk#?0a~6E+ws1ylIZHtGFuI6uYHczRvtOIV1wfS!`NGg
zMWKD)!vh8?2q@A5N=t`yDblGZ-7PIKbczBJBHbV@Ee*pEf(+f#rF2LP-Mk0A*RSaP
zJiqr(9~?Mya__a*T3h1MJ}a?q*%_t(7q_*mJdpf9su|`3tbJdAW#7ByjXs-cfaZe~
zJT09l{e|t^zBuSFdY<*J^0M-S39mM_DePjDOu=THy|Omfu*UPg@@)x^lxVS5(<j*5
z9DK7wc9TZK=wXheeOZG=7CE_NIPN^)8itlDW#nYkX3M<8d;8=$+%>}Lkmk_mWHqTL
z7%)!65cvS$Nc*O7#-_^k(6?A#yRN{Qv)>~gWnEfkSi0U1y{-rUkv%iN|08=gCMo{k
zG}-p@eVN4&pt(8ExKq00!A_dL#yyDG$+f}Sup4x`Ja*%A!a4}dWE<xf)el-7EbBhI
zWb}50W*OTjuI;UMut&+2Zfr_81ckb_J=TGLx!Lf^Dtb!+H*wJU(gaV-7rGXGphk_o
zJ=)mNXzO1-$<G+T&yI{dv1hA|Sg99%(~*8TCPiBMy0d)q6Bp*~b&C&PdVtJO6qGCW
zNLuW2CWqF>DDd+E95}$N=W=sqNk3XUtvMscFKnn?+F%FV_}CN4de)N@!H$l+#l?b)
zmk$&bSHgUv@&*jUkVWgc3|L9)w{dX75)&<#3S&>AE}-?6FAo;Xx0+TV0Se^f>nqJ3
zO!*&FWwAIU%<)j(Prab<TTqSu*;qO}w*<GJDJczT?XcdrTasoLsI*(G|8z#nqobDx
zjM%U!hj(sYme+=0+t%t@j}1~mslMcmd1RM7CjrL8hY!uwitI=V{s97>!_Bt~9-+ej
zmC5|hcLR`U0=6-y7AE21m#jeyBI{~}5S#sEe{)xywI8c)PR00J8Nwj>w<?71#fbkZ
zLck}wQN-LYtwkJOn8v;QzZt-~if612LsL__e`y`&1BL&u`oWf!?mq-z39i?qURAoQ
zC){n1$S(ZJ<NE>N5fog<4Km#aHEh>Yx>5Fb2WlR{tM4<RSN?;SQRgTQ%tMIjULFPF
z{RgS{Q?8LOJHm(~_y;`yzeph@$7h0-Khs}8M+bCp<NuMPK|J5rIg(%kI?WxSRJ8y8
zY*Wh*DGA`LXXltfc~1dIEzgf+IVQ;^r`ia6_YRMU7rHrTP$l*MkEMA=!;T*YG&!t%
zwFo?d1$;s$95)8W<IuE{T=PL4++jwcpBjbU<uzJB>KGIVX_dFMw3L&3%kb#I14z0$
z&m&36mK4zdpS6fe4i1hkp;K2?)fl-<^s84vv8UuDlH!*H?%xVZJ#F$*OP2<s%12PZ
z__OKJT>sOROlEryyKa!tN!O*k!C8x0S{Q|VuLIZ&$Me{xAoPXz<$3AT#q|$2T>!;l
z6k4q5T4Fag*w;V$Y!bPDC~)1_w-+3jGe9yqxs9CMDD3r;-AU}pi|2sdk+a0A#cQE5
zWRqGb%J>Jj!p29JL#uZjL@0SmhXLA0F9k_KSpm4f{=_Z+J5;_lua$3koYK-;J~-w;
zqMAa7TdJtZU3L6w5F=^X9hw?pK0ldfQBXN#ou5};9CaIdgh4i`jv8M2&D=<FRGG0i
z-19X~fzwxBdvn^OIpK8cdaR?bb*Y4a6~U$wJbY{`TeU0)i02vG?Gw@i6-zU0ViTh$
zpAwTL3cwu=gn6o}F7$L-LheUfG3=8HQqR7`FD$$OT-{MehgDbO1y(INb906_>Hkjx
zrzqX>Z~V<c^BMcT3|XLv12U7`I~;dtY5ks!!Cw6xjpV-IomOe8w3nXj+$)F^pmxqy
zxIt*t;KiF|8>_eU;Z<Bx)yt18F}Bl(lTLET-s3&BN%aei{kU-I&)>^+!oLDqJ3#)*
zzY;IJPo2^+R*Z>=KO?F%=r6o4D6T$?TdE0=<sTZ==N`1D6}Kh^ar{rx4%K}-Qz+EJ
zU`7@Y@6x?V_<C>RC}SuAMOCr_h}Qoghg6<EX$2ueo?2;kMMX5@GBCu_sI;NFXZHL#
z?Xh9H%8i?YK&~0m-6}^5s!s8L(a>H7Jrmd`JihL-I`EB~2c25|Dv`%7A|?Z?s|wBO
zKzzyW!0cZ7rmOg7(>!y3nY=V+aXIW~;<sF~&E@F3AKUSGbE_vYGG7;_@*WQ`@n?Kj
z+3P{T^wt*2m2lj)@`OnBjL}d1?kA9C{sU358o;Rdu$;CCX!{5!N7hmJvxB!KOb4yD
zotjKFYON3cVem_AE2@!{es)8|1Ga%(yN0sVe2a~=ZJd*5dERsUDr|hK)2r6Yhi2v$
z@I9@_DVNz2L>rv^-eOH=lAK;~?61T~-%W3m_Wn;HP+IIc9gwsb(NAV+V0f3Lpv}yc
z2q_7P=C(HdgWq*PyfS*mchpbD*WAU)JPt=EL9JI`saWdo>16z<+5}*gjLde<<Fc$h
z*S5@{nRl^X?{jvq*T9B!NKXvC^bTZsPc2tow`Sz~VD^a?^LKF-B%1PBCs(CQ<9Qkq
zUQQ6VsHVCty(70+oltGmm0P_`WXi%(kPZ(SX}Og!j%_c)DG1dhC}?}U*U>o5m`i0>
zy^15?!slEn1oJ7aJ;>@>BubBZgn9}^ux2MH&McrYT?S;!#PTHpBeCY2hQ%H+lL~sv
z>V#W80IHHO16qR_AIZqJXOVqBnOxt8EJnT~H`k~OA{A*m8fIg@giCUebZgr)ylyEQ
z4oLCWK*%o$ZQ@4Fv8T8!$(NJ3%OY27egwsg<C)n%Wq>~gSQgdqHF1#&plSl(d0_0O
z&`<ea!J(Ax5AN3zu-I#c?weF1Lz@!$oq835FccPiNuB*_rtaCvYrF5y-WY{n`!m2Q
ztII$+{D}^Kr!Ni*s<zsiP9BWS4DQu-NXTS}v+HCzjt*gM-FPh3{FF=DEF^K@)-t1R
zktt}9xvRD>YiQp(6WQ59A4+QL%`tj2$bghPX5Yc3YrQOFbzt?wlIeoy>d5OQ#NqMx
z$$vOB>9>N_@xsFYfh1Kq6~<12FZeKA-#<(8aWspj@}EYy<po)O;$r#9qS?j6xAIsU
z62TK&JxcM@_yVr{!Iv&TCSx5npizAiInwP?m<1F!nX~ZGQr#$7$2$zUAw#*iAFd77
zwM@H~006rv`yV;YBr6D-^6VwS6`^nm5Ws(vbNxn+S*C3RJ<M+3z8xPK3ASrfCR1NW
zxSU*>0ld8=EHo6zTHV~-&fLAOF)Ge}`>Xs^regvkb_p&ahuVF$3{!K_d@((Ttw*y{
z-+BSQ%F#eW%M|@N^y=N+d4L=*+n6p;R!2D>%`KziYd?BptbLGOl^$>!`Xmtmva&QG
z1BS}dLo!Cv__W6@2AS8?h}tvcrFjypbC+GwI4E?!eV%wrn$ay-ezC$4*3D|`$cYly
zSa`e^EdfMVRR@)}Uh|nS_d7<I<BC2@2)?^>tblh&)bd?B$N*8aH51hUR~D}5?f>Xn
z|FNCMFlgqJ`l7i(UwS2whsa^(va05`>GzbGt5cbAnyD9j8v}%bM?IQBIK*e~48R+6
zvUh;KJC=S8JNqX=G56N(QbgNepQ%hB7dbgOH8r(>!1x#aMNgUN8qHdF0T1JXAOxS~
zoyS^AJIA%Ab9Jr>>n}k3cU~Z(t`p{OsIFM}ro!&Q2(@ff#{(mwXT9qT0WWN(4N(k&
zD3{u9t8l)7<t@GZW;IzfbT7u@!Ip?+8fV$7Pp<}AFPhNjxSAyjN|{Re^BG#u^6`vn
z$MWzWjvGGBpb_Fu5dSzF|3cL>mMU~sE88Nos9W<AAU7QGo6oW6$44>8IWh?FU~QUG
z>Z%Vbv*FKrX@lHiwzxXIAoI{hlT?cVYgc`MQWck|<(X+>nEbQ3=vAr4;}}i-!*7GS
zaEwb<H2!{$DmmLOb1^enPiX+BaGujLlmA*!)XJU4d75I9G?iS%U(!3EXIB-8V!WLH
z;R8X;8$i<jqb73cppbPOH|7$%ESHKAU2|YaOP-y3(VWy)1Ti!Eoj!3*^B1GFYE16^
z!j~7=CZN6kCK>8f($!JEx)vsH7<yX8`ugSF<UBeoQ94!|O0IJ$s2Y{E7iRh86cF$8
zRNapmV@v0<Aa&YEf10OKl|ujxQl={$$m!Fl9#ba59*~$igJ_wm%#2-@`Iw(ywiTW$
zgiB_uw!YriJFz#)12C<oHd%y4c;lYC#YcneGB)rWa~$*GCP~PSE2-?U9@36Cwwj8j
zb~+AYD%gE=*|iiBss+N(F#96I>I^<*lhP5O?F1|aEz@a45TNdsmX?NhG7RP-V}S^X
z90n{4a+G(wE=<;X2)Tuey7}V%nek%yoA=!Fy~HW&o!;w~939Xa_!f7jrWG~Qjx_tV
zptBruBX$+1VvmTvKu|Ap_MwitTD`8BPx@u1#@0A5)tQ(#^lgZj)d}-_assyDBykBh
zEMW2xbFCxeiIGffTJXz~gy3!V?!<{);ihWq$`q3>51pc!4{aGScgr!z&&XJ%+cfU*
zd{yOmD7k6pzAdS(9xxj2e!R-+PTz0SpE?zp*zCu;foQwCT`Cy1zvO?LL5Sfkoir%v
z%gD;k7KZ~FiRw&bZ$m?ap%u-Pg485%jxU45<0D3j45b0EXrwbuUz5&m{k6`kqJsNr
z1l4QzZ?Ey2ZT8hefVw)-{lH|g&E0qZUn7TKn62*PT=B;porH5pbpT+~(r=34W8pKU
zY)K==A|eczo>xoC)K?-$dT*MnDB0JAc2sayW98!EwHX?<`AcxwMwY!=)1x|8jI>mQ
zW|dWEmbQJT9;)L7bTzZ?`)iHIh182Xjo}hDqe;&{=y$e2%RM-4=BYx}hn+QnJO{QB
zY4xD-LM=m_IgD+%2+^NckeVQB=X7tFT+7sZ@Z~HK@dV?RTln5mfS$4`uj+^vC};u^
zrZ2;nA0^Z!GdI$r?Y@C8Tc_7H6+cs4{xg-r@K3)&7Z=pNn`)W^)8W^Uzh_YFD!>>p
zxj`_u?_kz~HQXuRo)p7m(`L9z7Zjqv$dt$=$c6Ti{T;p5p0U1dmX{oT96ksaFr;w;
zd2$8pvRklZqb|jy2S8(n*}}NMgD>0hyt%+ksK1qSb*i?tC?O_>Utg~n^_g~)%doHu
z3|7P2i=6%DUYXz$WjFod<a%UOuF>eda!|(Ujp5y3VV*3Lq8R=#49LHnU8+Q(vz(*n
zv_O&dU5Kdh*8Jr|fpor6zF<)Cf<0fX7U|u2x78b?iRuj~p@wpngZauvUjL)+<y}ub
z8+-EMqhl6B$aguIN!YOVJrlo(OLCP{p;U>7lP5tkdy>%?;u1gG`YW$@J&Fkj%k7gc
zo5^DiE4u&Mg~O%eW*nw$-cn_E$G}12{;@L2{D?~PNC}l)VZvldv0@DoIx~4F(^dPr
z3wV(~nPq{Ie>2N!#BV)PlcI3q3&~}o8?t|nA3&f!QkGpyMX%tfy1e#<LvzwhEd;o%
z{b$YkC3J{fak#7{K4WSVlmv>(Pb&5+T_(Tg1YDmRK}@BG%hk6lz>W4~qGxF|r%icR
zVjB6!z4<fq(NmxHNHV$hi1aqg36n1KG>+06yksCZ1u@`U*MP^JEEEQx_G}6+BWMpZ
zlLM4@&dd#P?sB<<_4}yyI<LPkavk9u)Ev$=vBxow>AaqH#&Gkx+K^a`SM<6f*2}Pb
zd|uAZ)k#I)(x?bwu)PMvLL~IbwIHHkSxeuvyM6i??+w)=Nv%$$bRFefe%gB=F4}X}
zq#sq%w4Cp7DC)>D05YK|Q2Vy9xzd(P7>(XRV}cdJY6RE6`Cqb*2Hyz$QQnKoJD9KZ
zxdR>iqTN#02rmDR!ZVAjic{fAbAz^-f#D5i?0_@4j_L)7P2XCyG_xQA@o_W~uKfuU
z5D4bNhwLeJtBecv>DQ=G>sdzvqt?9m7w}s3F9Rlp9~Drx{GWz`qWkNO%YSNDe6Di7
z9{L_&Ax^L(ZDMH6j2+%%1W)7|Uu3P)#_*{@E28~LnzM|p(0x?jn~si_^IT#Bh%1rp
z?0iwo+VRm9^*#XyTIWkrrckN?JjSWDX~hJfM4wOcr~&t5W*?AYQc_c^d9;PV{;Un4
zKpy^}xLwhOhn0_&0PD1_T>lunTRQiGt2a`A5X21L0M6WXx)eYT1G)c=8(4i0o%VC?
z-hFx*Pym7Di*%4!s#bxq)3s~Ya^rPXfWc6a4RWf{?p=7eh{xu}2GEUfZf*iVKigRo
zZxt(0nczP-JvSO18&0Nqc=uW?xYRZ#lAEz`p5zUzHHQcRUCjVHmgryJTv)M_xksOW
zI8?-9ssXtyaIamw!_Cc2IkRDvl!a7i%~dV7<0WC1xL2eAU=z@PE8G_o2oYV~+&aW8
zv#8Q00n7cn`4`VotJ(xixkUb~uXD8jVFloS;rLp#VK^~!L+&o7sC%@Z(lfZA!|HW!
zBP9l*|M0mQvi$#-B^yGa(0|k9YAqqF2{>S94`1b(DF>Kh(H7wG0j~>xfB%q}(f`mz
zf^>QWyWWwIZB3Ek9{`E`Mk&=o_7l3Z8n$wtRNoIg36J>D5zxWuh?s4^fF@TjU!o?+
zMI|IuTaNP@RVqwViC_O|c!ZN|W@`GuPbo*>pCnYLFZ8^e!LZ|wIwlrYR7aE%qAkez
zzZ9o0RXZ@nb5%PsUaTon3%Lqzo;iQ0i2NVH{+Z%R-{O%X7brf@<mNIgqN;7E8~`l1
zIf#pn&a@+zBfOsGE<4|u?{FoP&<_`i$AUb=z5J%xuMdlj?kq`*$y#DeE2jRXmwRDD
zeSBmh&V@po=A4DjcSWFkd7;eH{h}NBOm#sz)lLcL@G_-&hP5@at*7T3CtjhlJH}6m
z4iw5*0HoeOIR36srmluSE*6$|BH<@^xVXrD1OwKv39!UcQ&R)}A4w@P)LDbem91)Q
zTAbp%<m6K)hig`rmek7`A_N8hk&uUhupIy4u_*?{0*>mQ_wufBnmw1hnbiUZ|ItIv
zzM&n3oz^a|3uI>&Y4<h3D88wxk_vv~`77ohd`29k%+8YR4EX6lIA{k4!k}`%RQUSI
z8z?u*CcFMtUeNdwS{)}P+Sonv90y*V4L97B*eh52a`Y@~EQ?V0e==I1D|;&I*B$|D
z8)-RrrC0^EgY=%El$2i(w~>ofp-D?G6kcRr6s$aG7KT(1?G;EGZk<R=O9NICqigez
zJ^vt5zWnj60hRK`pkw87QSeSH)*HY(FY4~jDl7O}t<dv$R(xBv!yJo_cL?;l1jd>>
zE2H417%I~0OUNlvvyhXMv$MO;TnfCx0XuhOL`}*#(rmy{R=Y%+MjI4Encb1-bSD!o
zYlW&EAoscQm*|8B)U1FJnJ+~OvQ;BWx8TUYxVF?1n69hem!?zr^nXTG%0co&8Lb(7
zG?^J08>+29#KK@hKl&u501o)G`hP}|2m|Ttf0d566&5lRo&DeB&$Ui;x*Bc&&p`kc
zR<ZXE)?`V}`yX<j3l!5kkpsu}438r2Mj~UOnwMvM@5Y&!>!0Z$t-4FyM;-=XjUEHD
zr1N<GkM9Jea7+^?I8y_C53S6m8+VJ$nVX?e3t2+r_In(S7TArBgXS&`q04M{gygRL
z8SZiOuxMEW$BgYOachL6kI;aG%{&+>1wPNOI)PM8p5g@0KpBSL-T(DjLh5PgA0)zm
zS#UQ+ejp5=t@Bs^CBs?0_Xc!)EjqU*ixKmJOKlt%m_|mQtqJOPax>Qg_Lf#dC6G4*
zhYlxA5ZtNjYh0iKZ%W<m>3mxngKMq%Z?l)Vori#?7-ZYy^Cz>MvuH&|y9Ax3B&~(+
zCO;p`Q(U$$1h9*|&*?|{`(7v^t6%<)C{GSEP+7yL!BJ8P5D(*q--mbVQ;}$s8a0hs
zjS3A9y6q4QO033D7KE;t8;`Vybds+|%~R#XEUUEw0LnmR`dcHB7<mWL9RNWwMhyUA
z*)c5cgYHaqo|PfSto<jzs^4J$n^pf~&9aG^gC|3;*}FgaqN+GYbic#wQ#FHd+w41I
zipG^(Izys+*Y};7Rj0G#>fg4=wdgeMKi}7w3fKq6#}4)Kp&G6wvGhe!nIJ^sp>Nhu
zv)!gN2%R7W9vYennjEKgz@a4Le%LbyQu@_`g|Vw9TPi-!92VOr0uCJrfU3IIaXt7^
zr9RLGG*J>Hsi6KDiNxO~D5wBI+Zk0)+oO1HsXvcDW%P;lyDp2lHw^yLOd}u^#g|K}
zA~OzAH@UG|&JcoTm0lbIyl|Z-_ul~&l_;zcmNpfY*Y9JIQ?XhEbA0UF{c-(0FeY+a
zNcPnNtv#zuw=lK&@46xz{Zq>PQ@+)}>Rdv{OTvQIjLvRD<xgm2uIr_P;cGISWlz?-
zuNDkD1a1;c4mxEWgH8n_{JSlE<1}6G{b4l%hJ&_bVOS)<Z0}c&_Nm1S=^p@WMbg7D
z$_g^$s6xAoVp<g3W_7hX>SBk0vQ9#|VI!hk;Rgyqn#JCC$}~8;gm@Nn$Y^yl7=yi{
zbR*t%*$6YCY&!wp0G1P?QnKYA&T15>j6!Hd=6;fZm8i1Q5bm;edo@RWV^oM#U~?sT
zQozWt%7lgOV-iNk`l+e;(7vUmBozsKYeC>@52m9_--k2z+P?;AHr|)!POG^}Ke`&-
z^waRdFisR1Z1jz^E%x)6VyqWByS7d{Npv3|m}+exW}QY>7!$PJY${)KF@3$eFd>aU
zyk(-eN)YG}4LoUX#*Q)-I`En4z$wsw*LVN8f}?cL$&L(deg4nHBBslgQ}YG=S=Mb*
z+{~Eias1J@T*ljQXxoTF+M&G_hY*-e>0IO<1$k<Vg&hkc?bp6}QnCS<d)Z15=wyUT
za<q>`bGI1{dRt?McHwuS?AMt($42m?ZmaH?mC=?%PdO900cIAIqpmH^hdyL8ZfHf!
zmO$EW%q(pfYV~0I_{$Xie>hNlk#<u``*@{zTY4^8M;gOgk3A3{vHT3w?|?u5|0&sj
zJ~qHsL;X3BcJT$D=*O2Q&Bs?69JFqJ{NWLys|7?rVnt7i4+ATUb|-(gh`51{xd}==
zFWQ)YSVS0AmAVU&94xgA0hVOFV}d`{Db#s?KzY%V^rFC9-8{4X0Zp=#Wi#HPsDt%-
zG8u=hbUxsB{hOoRj|2akt;3+`Z@d_`mitxV<pr6`fQVUEHqcuJR=c@4;FNNL-{kQK
zZ_)J<pS>$Oa2Bmh^71sBhE9jYJHVD(I(ULT7h32mhvv9S1kANje+kR}J-V?d|Kv|;
zZHFx7<#vNfx^337#?)q%q3x)}lUC}A9<bs$_AXWQn5bSSn4DW@S#+py{dc1KK`WPJ
ze`lK%I#;KV5zOPzj=vhlbG+ibuq$<8`Zbc5a2PF^zjXc%2_<6eWGg!wOvOalXea0W
zo==%w0)mXRbv;q5z-IV2N4+0|171VFbVs^yxkicAB#h|@)W3Ginf54&ee9&ACK<jh
ze(T2^ye!Bg@fuIlfh+v|djo~14T)8Qd?Et@GX?mXc8}TLn`*JxQf5EG;|*m#!A1Rh
zRY3I}cudiJ&Kn$B2gDL@Yy_VSy!Xj==G*Kf7axHsLe4R$fK6dqcIIrv)gIk^cM8`S
zNT7`lypH$$j0bW~Lxs<Vc;=0WuY&e((e+o@;|2B(3h-gn(joa}$oJ#@`C(LsK7FYU
zG_vr5{-Or_e3##AqZHRU`d%eqhC~AX+ww2|+Y2y;sWWBDLvR4l&ny%F{Nb6T0L%mP
znTa8K?)f_h9mqK99ChIkZaf0!kC?waEB+0U&ej+8%mnh!uL%xX=K||Q2+j0LLs#wQ
zC!99-Xq&Z!2F=0H$rI#4j1|`LYGGVK-1miLd)~9HaBO8+CDKH(d8G)8)XB8#A+LmA
z;$d&<F89G@$Q{n}bNHhQkB{J+)5VI#JiA3a+>g9&e6R+a-d37!QODo}io#O%OMCz0
zN;%UJoU2s}FUQavQMNRSo5fE2G^p|)T|XUi!C9xxwmYpIy8A#__~K>N>&?8LL@!aN
zaG4((GBsCVzj(YVs^nG4#vu2qcG^1yrC<WZ@aY`+`->ts#84Wv2~>?Prrb32eU<R!
z4n6iA^V%r)Z|#NIu)guE)upG~$Nc>K5vxl}Psit<RVTT3p`M+OxcaGpH`B&8FM&~N
z8W)C@bJt!>Q|GN$n<OYNSo}5_FCubSo?m4`L*y_qaZS)!kbW40(OYQnRG!vH2D`0&
zmwhOWX%uPNc0z`SASTy=+lSQfn6)1%mOM=2cRTs!)Hl~9Z`^cZf3V<#Ed73h0eMXP
zc^?mTT9KgSVia&l$B!bmDrJ&a%JrbJ&=?7MVxv<PEI`k-rjq`M%|fF?!1}n?X+JB}
zZBj$PmSKVIY(I4R2stqZk_0ZUEXbCw^KwEkFI(5Vu5-k_%=l>*$YQ8O5Yp}{Sth<V
z9C~evAMB2oX3Z{TG99l+bPMYS+ZZWs5!S4C_|rGpE6XlD&9YYSm`=f6x}SwT`B7um
zbWoZ}W8ZkE(Sm+<HzhN>#N~r)2lt5A30%o>X(O4WsHW9QoGR-}iDso~Pf15U{WOI-
z{n*fk_QkVvg;qRQ<<`}yAx6ylM3Fr+IODWoJ%c>wcA0Jig!}v#k9tgC6;b7^WEuIv
z<sE#t9ziASJZ!c2$u>zYRVbCzF}6f0ES^34@cp$?t6F4zD0*(u%h2&S=9Poka(Yg4
z&C+SB<{YRM%^Jnb{zxT%r0K}}(;#mf)iPOme{>mU`1-auu2pSv%^*|aZka7$iRr>I
zeZU$kqpx8oM3-HIm_3ZQy33;V`l0j-5M4tH#~ek4w+~*YpijDr`AY{sJK6T8Odq@^
zpU~wfUru~)HAqOnhauC|Y>TihMOw*q|M9@KtrPC@WQ&rFxZv)-%m^(RF3$EkiyTGN
zQAP4h-Oe@S^_=lT6);cr=^T=Y9P*d<M)qHVrO3xi$luGE<B}Z;!>f5hutpNMznNT~
z;T?dc>REt`A32NT{o@UjSU3Hq7Vh_{L=mEbKIMw;g`~q!d)Wr8;RAp@mVlGQel%~%
z6S!_NQ)&+1UdR2lyj!dNV>pe=`0G#8iuzrGdii1MaQpnnQkr)~H<VU`%x}l4PjjWJ
zK81QiMVVk<OmgVJOs-7SWXQKPVuJ2GfrBBp@U*|jmHCOUV@j{yPxr8KoKVJ?x^66F
z3W>k|=NRbUHVF8}*z}v)H0keuCGW_QG*KUa%8QTtMqBVLwu$rpc8;=Tjq<E1Ja}C_
zBr#%AI1Hu`Akg@w0~Jrr8LpQ)kSHX-L917}uH>dSQFFuU-IMl?p=`TMS%L!Z8)oZ1
z!~Cp%_>i_gR%*b}5Nqq$euUZgu=T30Y-`_AMks7Cp9;S-&sCbf<La>fEKek~Mdfb$
ze66EzUvs_JrKsTVUu46nu(PJuiJ7Zbdl`jp^F4iUSLKc5#m$b{^4dIj@brKNOM3NI
zS$Ub^3=5?5uQRnQFQn^)xHo>1VphT*5zbvffYokTW6!j96gxaIA+hDk3CrnB;s|~f
z73?3mdvAERV|$7`F?-4MM7}6eJrQ?zC#bnOzb%+{d{qZ7C)!TZpKdsPDK+Vu@t_{7
zpF`tkFHJSh!ox5~-PJpt<=6xT_3x=FK!^`hF@xhy5sL}zVOVGgKr8l^u^PQ8%VWP#
zsVuApO*B*COIkvwp`vSsPBVGYo>G+VPF$J}hDC;X!i~#OOfSTg;Zj_F$`~EnjP0@T
zn+0|Xr@d>vT3U4OL<K!i!t_xRu<Fe?O0QM<W*@@QCP=JyX;3^LJve_lTyL+*qE|cb
zZ`9;P*}J(tvK$qnl)nwte>uC_wQkpn&&ibBq#7z3t1P3QE{VgT{GspFAy3;qRo!AR
z^$f%N<u*@x6h{O;2m7TuExQ~Q@T5yKg@2!z(a*Y@jEog`cDg&cQED}Ru{4;<^*HU>
z^!6Mquy+L$c>RBm{pg<@$>t>K<MR*>QMq<*6G|^A(e&Q+=>c~`tJc^Fk93AgTvQ)?
z<3Maf;X$mpXJgKN6Daw(OAmU$AoEOMVYx|;W%x`v&S$gPqr4l_ds^d?5tqmi%cEc-
z@U@4Xwv2f>2nxcmJ5Ac2wdi551%9f23M(r=Y=G3={bN-CTAtbPB0@>i@lO66p}ouO
z=M(9wnt^s_YBQgn+SAxF(_N#3T7^FQ@6~qk(k^8SY=xB7ifb$CHA*D*%50<bnQ#4(
zFu{yGm}+Mk*7}TMdWLDvSbwozHJ_t#M_<f-t2~ck`|6UQ(=->kQ56)lTx@!f^kT9^
zlwVhIkLGR4uIz20u*u`m#Z+R%VA0{`UqgIV@7U(I=wE}0a40akHc7+W@>SAP#`?iv
zM_JI4&qFyKc(T}Pmkim4|C<X9@n3PLeHX_U1Io^R6VLfCEAk%cA5u3jm|$cjS??{x
zuDrjlRl+Cah4dlnNsShXvg0~RVjrMFy(99+=VFomCMm6gXhn2;wY(u&yD8)QR|C1y
z{JvGiW==lb^NmWCW``tSIRAe229`_3q0yDEX73lK6zE@hE}+%%wfeb(JL>dxl8yE8
z^{%e=E3;xzMi2(HKR-G6`l2ZuW4Q6NsF~og5zR<KQk~<Yi;AKL=1#oU>OO013%wW*
z7S3;OP+yY6Mo7O!?pBLu6pTq+^R=I+VVS_bi#3@ZJQ%t9d1iDsu(tAu0CxX^4$t|Z
zfJJNcYO|x^;H-G}>d{36@ilX@njHGexOFJSQ5UdH&QHhQurJ=L>sFI~;N1O~J~d3I
zxj1^R5kA+b#kjH;Z4(r+c>~uyz0Y5G58n>jOLZWfZ8a~ibjs_%2SP!Nu@Q<fFwVqC
z%uH|`&e-7ahS3FUS5`gE38rW)M`oeS3i&g90V74~!fQ06Bo{{}R@onWb4;$JcHtn*
zu2RN$8@ac`%%;s1+oSh$9G7fN(b+HWmK_$O)Onr%26c`WO}@xhZ%Qw!I>)xRaJ(CC
z(PV+{y@;5Uw+Z(aPBvUL2yU)1F~$uYBOzZOV9%=qw!UunO$`9zXzIYrrmK|MFKzWH
zTi(i9BYLPne}}ZP>XLJH<Rw)HNjsr9?(T^<A(?JF+h9^m6BGOCsRi|Sn=W{=pPvx0
zU~h$NIS$V@AvULkLsG&%D|0n5jv^G?qC3>>v+16!#<+j(t4jzozLNqugC($)?(jd}
zc44px=^iKab{VydHme!k{oZaEOEU^TiMZQ5aRHnB&$0Kz_*;na%Um^<I%iY$vd1Dc
z?v`nR$9-#l8(lO?{(J4#ag^b`2M?B!rkB@5t1jri|G1}jc3QpFUM=KPGfLghmoL8>
zETH3H4XobJ?;<8ZZX8;w7VANj+B(e-wlhKJKdg3rTy6z{<|LW=j@Wxd+m*9lnoNya
zTs6XsPo;L+itXZybK$G``EhDk!Xx4wa(nNpZg#1e9U0wUW?Do>B-ylziz<T{^VkVQ
zgCN90hhcmoWPWt><pZ);x5kIY?;VFx7(8SM`P!Hw0(nJrE<}yOCbW9J=L2r<XEjC|
zm96~?U1qN{naD;!u*xUoLZN!F{?P`Pt|g}5R5Q`Q3+`$#52DAj7!5=gu}z`{xbfLv
zN=SyOD`!i`aPZ4L-190P!=r=*g7&{JtT3${!K8ASX7!gecnZs>Rv4j;HL)%!j>o8d
z?AftjG8LxFMwp8~J)7I;L6$u9t+p|Dc#)*28i?=tj=Z3X_z8?&Y+bDWq8fyo^!%tF
z6EL@!_P?#Yo7^v;RphdnuBp9QwmynJnR`b$w|CB~HLuDv)K;l7$9UH0O^J2&460f2
z*QKn6Ms1Jg(Ftdww!<q<)Pw_c%F{1HBy3R(R6t~zEtXWM5jb@ny&#dt=$kU=UJ%kU
z7MN$1KSpcECCtmtF0qb^Eq8uJ7`U`TYOCG1ZCZkfr7+D4K$tyJ9c=BRv~_}86*3ih
zdk6vIO8Hb6R`%mV`-wBrE2A>STasw*(?-7dxxABRzYUJ567?b}-#lk4ZTGuXCDbj&
zb0HHs`$C$v8a6!hRo4o)E2P;ap)ISjToVhQ-FaLX-aqwoUME~pLakIt<us_xxH^Bw
z6DfuXm~>czdpg3x^&_Qbz&qN)!TS3vI|p!?OuT&sbufJQ_0pa8^G9CA5}vYa)$&QE
zr&Ptjj(MdS+byoqJHC-BhC!?L@lYzkefnCFJw_qg&~h6qgX!+E4`_+`k2BvG5#-YI
zmw^~l<l2az&{M5_1m1K4r^U$UH#oqF`wK9>DI#OEOUumMU7PgijOS_axsKj2ifj@l
zc6Vp9COzvNzdLV0^|}$i1=?uvbtVklGi)|?%3(L*4WWFpi%I5Z2TAb!L5d_Uc&ETN
zw5(jP?oC-TMx`>K95UC>A10(ZR3Wm;$RIY+DtPZ^t?TXKn8nGC-R#F9imH2gw4u!y
z?DhJMiF!*$C?q>$t>vGIO-QS14mR)R+4M`vj@o_`W#i#SWdpvoW1t6WflA?28?)6>
zEHHcPF3_(1<c&6IEv}<H;!EZp{r1INd(2~FwwAj>CwtTB(ovVyfVXCpRxB^nbg12R
zd#S(naD45k9SMv_*5~EbNPUW4qxFunZYt;PTauj)|BMR#KR_*KFEVF5zkG-Pul1;5
z4=X~pHn`2c`ba%%0(~lZ6^L<4rsSW#YO)C#(wrrE)#Q(YKxnL%)<2zRpXlqzZj^uX
zr7*as_MxkuU?50!WjNT`pYHDY-Jk%~DBSM2NP6YB=8PCEE#RNCPFGfE&F4Z0N#Bv~
zF7?whGFsHXx@sTQNCXUeOb%LugdKdT9Eb)p)kPNbeK;IZK3+I~^1ywO<>m%ums{My
z*Gz@7IrV8;1)<>~mQ&kP=Gm)qj>6C4WRv30*YXp28EC>}t4x>e1S;g)o4Cj&Bqdd0
zb|#?D>B(M`P??p7ek<sx5DzB3>GS7{Ulu`^)*R45L03X9G^MGjDQY&TxgE4tsRW%j
z-bJ!=KU*YRrP(NHS4*=ez)KAX0oytLuj%8@M1m`OUNGq%X>KCFNX}#AK{|T;M_gE2
zm{6fX>!Nm#iz=xTJ@xyWZ?p^J#xzxP3GBWt2QNHC(oOqUak%q&vA3ru7B%y<jWCE2
z`Q|CJFOnT=FIC%swMEBgqU6e0`TY6we4>loe8UOGTgL4>FgM^W0@}UvI<9{O2e0^v
z3!ja>eOZz~{607EhLRW9{<2(9D`$0!n3zxRv)`ar)&7k_+ohu146Fms64w*ZPj#*%
z^<@vzxFa?qG*k~Z4;pDsdYrm-7WO4>dAPd^dXS7Y#E|m8`0BZUSq+%mhikP-q+=g-
zFPsYBh<P_#Q)xrdLU}65#*;74#imu&W~k8l8Oc7}^4~L5FU(9ftXi~bHJg%>Q{p0#
zS)fO@D%T#$km>YV#<W+YLvJisqNNqnb<wvVA0J=Q^3V#VXe&CYKl}ljDbrY?rI72Q
z<g1e4*MwApA9rrBLhpCIFdSa-uJf3-7|1159R@vx;r9qxZDyK>i?@fN2`5NF!O*Z1
zGLdpDvM|{$^{dCVgT*m@hc!q9@l52DHS+2@@gk6KbZbv0g$73F`LnKkU&#}4($kv)
zCCJ%nzuj4rBom5ysKb0u5;cf)-RonG6SI+SFwssr5w>S?0~-K<8K`BW?1iac_^NaM
zjutNG4Oz8j#VvmPxQB3vJFJ+%DM#_c2fpxp^uoo*6U)lX+90`@*Q}ORPj_PLGS7=8
zn=*wpe>@U<I9-b@Lg^I)2E8=0u?Aa;kxacGm9?B^d&J55EqRjj^<Ol-C6SzfkJ}#e
zTv=vn`Ib{qGm<Kf)wx`Aa<IHsbC}Qn?J41W(@h20*G`8k<%_Ps!;{Z-??ojg!m{hA
z%JbAps)>PtVdndN5=KtW0q&*f!>|Co^_&lsMdo3+TvikO_wK#9mg>K{<jf}im0!{3
z$zcZL)C!iE5N?bdrsvc1p%Z{PID)s7s>`kR)u5K04X()xty!u3w+DyVRl_J_aEd|V
zvh?L!X#84E$r5={RutB8-BX#CcPz?&R?SycLTAh_3ALHcILDRwPnN3<X14bG%^G^m
z-Yb&1yO>_Te!_L2j~@@)YEa%d>iu-1Tcd+Ai86-&344r|np!M29qD~(CF(gs>a`RB
zNo^>FNz3xgOj@82+P6-`hTsJ4^pt@49HPF>Kd0D%NgeyCQw*twV8*Jetc%a4H=q4?
z$JNUDW*CL4gW_TfgB0KMIW{?wu;ozXh$cB??9%znkT$S9Fgkx>S?8rnu(G2S5Pj^E
zVg=0Rrdk`1b1U>E)}eY}){pzt?oFY0LR`Fk%Pun}#Wo2+DfENTQLRa%DZ|<j9tT3t
zRCcp^X~JS-=@3`hGse@Zw=w1`vr3ygSBYHM<QFGXrQ4|OquS^<UMbR3W2T{w{>M-*
z5)}~4sv`)KpWVGOZCH_$N1$bPTO|(I`o+iRs^=+ljXm=fLPOd%-tm}g+VINqAPR2r
z?>y*WzUo|g=foh8+Uk2;$5b96f-<19-P+n3P~@*~t2KQBSYyEVCvOtl+Y8w*jcdDO
z;V7Z#P@I1eZ#;+Vy`>$MzDfF$Q)@i!GlNk&Q)ENpcMC2L%&GZ>F*c@&20Hp$e$Xic
zbln{$edTT&rGJ9W3*^qh(K`b>XyEVf>n3~)P|dXyXqYXm3I@+wck&|&iv+E4o2nZV
zVXEsOa@H`CQMu%)dH${yiA6>R`)5s<h0itl*S!5iHciz?bpQEd1%;b!D!|7Q^eq;9
zdpjm6i8=lq-(*Hc##<7;0pEaDxwa<Nk2+oil}lX@KNr=N&_wuRya2XcM-_1O_c<41
zY62GW;m+GjQQGb?{Cz{q(P)#viRU<oYl2o<TTgeRMAjJe3~r7waE~vZisQ06V~jfX
z8aaI~u*{}AA@E+-Ai3uF#qwZ|A!Nj2^ut8pr_&qz<KChkMtUHD2pQWZf}nYGPl8gx
zHSfr;0K{LqX@Xc5K4R6<dNb&k>ycLQ=FOW8I>A7uHd9#kyNc5TTkO6@-#{aJqsSel
zo0IF|Z7?)RHld^SM+yGLA-!VV0w17FJqaP!=Iq1F39)<YLF_;)Im~@ElF@wF#c?AP
z3d&!fJ#0+5(#;N=<s%hhz!}z)CW{<1kvcV~lD_CEef}8#$bb;iJA1>%NkH-YF#za+
zDQRf12v{D_yn~B$_ohqFqCpcS<)nMtOpTpydteIp9(MArtqTI^%0$j_q{IZgDEijv
z<AVD$IHx!tVNa>CP8-Nhba;27ak32u+xl~qg9+Jvq}s^ZDy*kPd?F`>t4f(}S@@we
z<W|`fCFtriL){DB>B+3vn+!+8SvQ_mX*l+3;%zsRnPYespC2yL;|uK-tUWQ*mlqRV
z<6g1r8T%InfQ&Ub=5$cs_}ZTQYlW)z87e}!<IOqa%XRkWZ|ASdA(v}z2z^5~0G<37
za+iDW*Hhz?r8Z|Ij4@WnS!yE3MK`0iI#l)N3Z_=hG8qqN#y`<YbZ1~b*lo*Eg_npE
zP!7$xUUgfaYcBdj_6#g|Y`lIo<r2ha4fW529i+Qx2x3}u_zVnKMQl7*=h9z-QNq#!
zvhDQoXnK!lcPrRABPmet2>*pBU<TrcUsCv7NvJ-lUM^dapYa7l5oSwx-*z~%TKqo7
z<fS{I|Mi)RX#3wLcS&PPC?Cnlr#oplJP(#Lh3ez+^Vt$&2`Lq_{{{D-rL`<t*cV}n
z_HO{Lv%TCampmK|^=paYXji)Q;FZ_8jSLLDNeGzNlR(+Sppth`(Q@`K#SF{<4i6g6
zhWOn-Kk*=$1jwV3r8TpK20arDB)Eb=imcq#+HuqE`-^XJxXN!=FjGJw?<>{}Y%#7l
z(sKWkZhnrF-EKz&vD0m~gaDgj3zV;pfBja)5&p-OQ00*d8=KSDjAVyH{{a2Fg^#+E
zhy(HW5ZqYBbJpW`1wwtO927J$4ad*UDZr@|g(x8LXy<PFfVQA=wI~|;f!Z)mrj=$|
zo{!^AiHfUBs)BTCyE^+zrL9e0l7n(8Y5WJ7ujHSNHVACJf@AmEtmU1xks4iB#wv6b
zrJ2t5ZcPX|-e0lo>>LR_$I9b;G4sl65&7EHu6bX-q?6Uaoadq)%cOjXr?+O>O}E9x
z!V7L>jxUjFaAe7+BP=+A1C&1Wr3t$>*hJ)LGFQHsO&Z28&kLUnNX+hCP2S*`evWpq
zsrD_a&DXy9fUcG#A`M`Lv)Q73oK^WUAl;gDi!*E$Zxq=ka7}%t%#2@<;F4FtQqgq{
zfo>O#Ind9)+=samlIBcigkpyeQ9QqyuD(XbVmeVV!arKSd8f6b?z}#h*|*RIQZwqj
ze#-e<)qX8Ds)m=Ss<^aAwmGbn&4*y7O_T4GmnY;V#G5w(3c=7{^6+%COy+^Ubq)>I
zw~JO;`?`~$`00s1sCjtop0K<T`zS%3(>@SK*a7A7m2<O(-xj1y`K&;j7Pj7DoK~?V
zF{>>kq2Qdi{$)RK*A67T6MKF7%{Hy&Af0A_88k*=8EOheDHon3m^a$Z*d}#-;ch$l
zr;_4TQVizMbXQO*U#;YY$x76t7g-pCo%kB`+|bemyGCt}T9f+3lJ%haJ;wa{l!8QF
zjiYT}!~M@XaL+K<=77cF;RFQ?v)zKYKcQDBCcb->P0gw3EMvzbCy&u)Ik>mL1PQ@7
zzoTXm^lV9u#CS_=FqrqG%;tpDx7J6xl7>0EMCnd;%Bw87^l9C4m0U&E&n<%{1F6vl
zFj`bX0+^s9bR^`3GOL<!cirye7%0u`3Neb@Ytcb&1!da-9nJB3!7;U_9S(<k^5CyZ
zd7e9(dX-53=@(tP85*UrV@MUj3W-fqg%PFQ@J><Dsi<HIkytJF<E4nWHN{GzHHxFi
zfHW6$*(E8)om)@2&}A~yZeAnV@)yZSiY~Xh$Q=x}xYe+W!D1vC0Oq;{L~rh`Yck$u
zS5bBt(4RY1jNkY;BR73uXFFkn`j!9u0nO#mQq^;-P1tGsR%0Wq61GtS%Sy9^nYFyO
zrJwATvK8J@c|w_5bxl|B457%x#bl^(HORC~VJHknW_|TU!q7sisp5(Q8|*l`R0e$4
zV4FQv>fBiuiUZ--ZZK2GAj}++ArSZT7t>AhX~xo|kCy*bSmH%+(eS6eD%%G;XA`v9
zM@Na@BzB2C?B6doD!cfkrN+tt7hh6m(uxRpCKxwpMwNdT(4&LRxZl$p-CgeO%u2~|
z;`itc#pnr(k#j<Po%nId+n_*>E}cV+8`Y-suqRoTJ{gy2cg~Aog^o=)vg}gwW%6V0
zIJby~s*gKr{+10+$PS*l=Khm0_TU!uUY|;5dj{Id;`vpz;uyAU%0+?pixicHm4_f%
zR^<{?-c-?^j|^(sT^s-pm3P(l2BqQ#?JNLp!&Z6CU$8;mo`<j-H>}}Yj9W{yZHHxA
zpnJ;oN{iEX{MKKtTY=)Se5TBVL~t=>!2N>gGN~|XQ-HHv-L;K;T2yazE@AT{yCIIF
zZwZ~>{<qL#w=%V_AU@--)xFUxZit3hCUDKPEmgN==7UD(zXh9RIr0{wP7`wL{QOYa
z<=`x0H#}E;C~3%Sc^XgkG`FM8M9ayIB2v-oh)z}twlpD~C_Fq`qR1t+F?w~XG@Xf!
z(g&~2j6oGOP}!u}=Co$FIXLlNRB*yQmJC~tE&|Oi*JlpBKE$rUg1%16WpB29@5`GG
z>Y6d2e5E_T4*_!4(WhGh&_RA|<xSeD2%JNnd#2xGatXywPs~QL$Kw`H0XS2*+iT6V
zGMv)(IrQJeW=ONeyda*J4zYX?UAAPzGb;Fk-QuwAUQwWn4gSJ16~6yq2lKRz2(J#@
z^zO~d(;<hnk9(meOv+Ueoz(@R`e^cEh6ZgD4Y0;Y$51-1nUcTA0Ld&9Y<jSi0&g#*
z7<X<32;bm4hi-zmqHa88;&buLUldPUZphXB!UT~+Ik)4eP=k~MFADB1-gf~pxeojW
zs>vvb-T~);RN(0g{_=M-K}Q4X;)W>mHj;Lu_jB8b?UTg!zLJic)l8Oy>N-sRjfR6p
zlI2;kuqqG&K|g=-zqvn+TP&bnaU}oh0;rbc>WVhvJ9(cl)75+?KJq&?qVK+!?_yIq
zg^w~aXWf<&z$jqj5Rfbos$qEY7lJCmu2uNeB8&cj_cDQ1mZQw})RSP#s}2G#-?UfN
z>^471>lv>^u11m+arcgZ4bAm|<k2agy0xU9Xk^@JntX&%uY&FPBWKF(wGmRcXp|SE
z=QEQKBRJEU>51xR;owQ<n;(S^Q`IBuj7C`mGoH9IhK_<@W`hEAM_tTLE<=8Fm)4ME
z<GY6C-h0Rk#`C`jR5q;hkVTW0!!EDdBTwdq^4!qvKQ4U8hTp(Zq&k>GbckyhmhmoJ
zNQx`$Z%UEk-FupH{p3lVKma2O<ouQ!n^w;L`qk=Hf<1r{97qmhZgIU&E;i4bQ7g(d
zEsq~4euP_?CmD$z_?`ct4^)wx&8}SiOlv56!1KWM#2)`PL<j5q*f4kKMXiUp>ZgTI
zYu5aP{M(HBX$5`Q^DX7-eEh(%>ryXYSxXFrIbtV-p+IWRht1Z}+B^_6R|-MAyq=!2
zC)GYkRtutmmEI!j)IBu`N}sn0;vTUzHWY3Y8{S)mLaxdI;rT1b!V3`B^DoQGCsj8B
z)DY4}(sHGSYd0pRg+ay8bo2ugJ(HN15846Kghu6)LEjj`QxH8D`d~xq$j(m-*rowD
z5$dMAQSoKAZHUJjRYOI)>oZi#pE251nDHUl7teodmI*(q(4Z&`(`|>t0r%QTJ67UA
z^q`B^0+{8rLP6Fl&Hob41i8cg`(MF_aNQXwMpB=}<0-ZQh%F(8L9GRJXZyJ(y;7P3
zUc~mI4-y9P{CaM~LvCX@XeQR!q*{CC$9IZuun0%T6>fv@vyn?Qg|9}QOG^qL0(lAf
z{S1Pg$WPcmvbIZ%Z)w;=GLrff+|VJMfw#d~XVW?ftBD0gAt4(F^s2T?fADoK)k%Vc
zEn36@d@+%76_kPEH!2biP3_uL17u)AC(u%pL`?A^kI!W=_z<)@NlhnHZIm}mFeY&}
zm{%3}j(Xr#+EYY@=v@7M?$AAZ9pgg%!%ZZDDg7*8Q7kjbD{wGWhK;{Qg>a+&9%Ws6
z@I)ePE^jo0IIFF}E^|vBF(Mp-wP*=SPDk6iU%~X=`aK(GhO~b2h6d&HrHLz?=C81x
zJiYq93gXFfZl^##ye$6&iw>3IIs4iA`Jd5#imdjP)%<n&*xiAg{jZPb-f@Tsx7Z++
zEV($*2*uT6g?mllZVOIly!liyFPa-$n;de-?)<23pjWPfY9BbeBK-B<P=G88qZ#4e
zW2oT<=+237^50_=xpS5*Z?Xi4e}!c~g7F!hWP?&>Uv~^41e@vHj`H+-310wv0hlpW
zVr}jTj<Z}X?v>FBW#!VRw+dU$u(PmocmsiSo_m5({ICN-4l|b#+bcxqrmYD2Xaq);
zgK?#-eObMU0?|PEJyt};zIv9yw;$DRyb{CE7-a2^0<cum(IYXb!JV@rYLy5CQht6S
zhuS(*g=DoWLwp_BUTD%hOc@1<<X$<(ft<3;ZNHGAzvgu4_2N5CnddVsi3yc5?Me|m
zZuddRf4<O!4^i*VE0QJ;?=3y95FFPL0h+*?!%qvD>*|2Scdn+x4ADhIkBD*^i5G1y
z$h-C7A<F|n_6I3DKg?G+;~0v7$JMmG7};AYpiqVBE#lDKZm7OP+QZ_C578kzw{agh
z-nkSDr=)PRjdV<tZbN`CkD#l1E0S9yogxhjxF`sz!#lq^EIKkPa*Ijvy`|OTv#58y
zYF2AZ0vzLN7Ia$@zBAo`)TN)_D#=EchygB8J5IHiv}<|U-PE7ed>lhoS=9)RUD4S@
z@4oy!qK3S}3{3)cl{cmKAtTTWdhSMB#+M;OnCI7z+)f~d)pYQB_ux2K2JR%Gf6KIB
zXXKw_0ij&E1WRlO#V6f7O)eIr6S~BBj7|0n5OS1r=dw;MP#^{%6UWWtjNh5?ukdW8
z3xVt%Y&d|Fd_LPkW9NsUa~Evrt?%u@Jz>MI|KVTJdCo~L%AXavz%an=74eJThw{-M
zyc<}t`>f7vRbeJcZpU-2L9sMBNnlshI6xjyo?k_km;5jA1!Bf8yR89)Fxd7m`r|bS
z_BnnBGsIAQX7_$qxF{&G-4uw?eEk8kRbAj=c;zM>;(6u&;uj`T_q}h-wMRC<2?v-;
z_Y0H)5N`5#)d66VNv<1`hdardFdPK>165jZew`#Q%Xj5GbkMxo?*(cNJk>(!9%osH
zebf~Q#PZil>EMHkR8mS1Xo`3hhx%0s0ifr;6ysvFjfxDy4|Kn_FHr7vkr{GF@7LS-
zF`&5}*{cp}Qvv0&quV7WeS)CU1|&@*axyYU0K|G18k#an!^PzQ3Mc8S)b?66uKWJD
z^Z;&*OiC(v-MEQD%a>N=ZNxNMw0q_BjZ^m6oA|E@tQ8JYgWXz%D3CjMer?q6^}a|S
zoMA^vK|bhGctS#lJmDx5Ao{?RODvFp6?DKx(vZB8%lk4O-4VwHY%$_QL0MRlDE|rD
z+wTwGtG~VyhFgh8>4@@sj;RUwG&Nhj(g1K>87`*T%^E0O2Av^g<GEQmIqiW2Gg$C|
zHf99|T?X)|WDU%q<z`6z?532AjO*#~F3-b`duAgm0n9b;@%=9t85%+ty1h6LwwHI?
z*}LAE59Vu*B7#v>UJ`>sU%Ax?IV<C4_(@Ci_ez|zVcht&_k<n0p<A*)Olv5vh6hDO
zD7Z)T_&Sj}AMGKh4dS+%feh1mYXP~{CtqyLqzT)4%ZIu*^$KY@0pADWxl9W9@y=*T
z@^P<Y))DybgSgQW^73nwwL%G<pdh-~1sb7`P}2*7l2mR1oH&Nj!Vir{pJ9~k1>`N)
z1VH^rrPV(VYvu~i*oeF=w@7tf)+W>gC0)n^tbY%vFv0pjkMd!9-+>hzw3pVCxA_9(
zhgnZEH<B_EeBJ8Yi{?8*#*&=CdKh;lau6{Guchyf)VNi-Y5}h;ejZMXKC_k*)wrJp
zLbY*X;4vvDNKNA?yL(Y;+I%`9O7r#VOBY1fOsnVF7GwLh#jadPNnHc-(vHj0HM@J$
z!C_R+%l+-{3mu@#ygqP25d_d~XI1aCE$GKS^zp7_)(Z*NZ^ifYav2U=-W}k##OyyT
zQ(B2a(95U?e#FhJ^RRicUrIh}#!rE#Z~cJMF~Qe6=*Q}$fDo?tebs}$0oU@i;h47j
z@`lavf<zQ&D|O}9JbbuuvjV!TRGC_^W(}_iQm#sW>xx~bq3O<XXIh~8>nYB@`cq(-
zuL-le=d}G%x8c4#vmxoHvr-~h1ZN6&DLeO!rlug`CYvg7V!_<F?tCBKAhbp{J91f$
zMTdlZnv*ef+-RX;wTS5D?suGh?f1UQb0Mzmo$(R^ZV2)8`;`{n{qivtAoVpnBW4@Z
zhOAmW&={<GcH#YnC*LpV!l$Zbf|SG0Y&}5>a%wjqa7<@=cXjiraI#Z#etCHxVZovx
ztfub)<82c-?|9=gMI8`l!)lKb=d!TC`uCoJxm2hS2+pqqC>$hVv;<p=Y;=6B)rxj6
z0EwC9@#IP+)^z&rY)d&PtA1AdbQCUex^ZK|X`Y01Qd$p`i;sr%g9|SQ=x^jKa2M1Z
zm?|jrk~oZgv)C|xkJrhrrRu;DICPQpgH)PTXG)OxswEP<li!&zA&}@_us^ox%>7&>
zvYW6#s5-W|sL1(fjjn^;He$hgMgKOl)$a1(+PK3c?%`o^`^{l<(D;2Aw3$y_Q$}8s
z^y;j=M2<F3Snz`Q`mwSXn$wdMTpYB2FMrSfV~`~q%`}G;8+S#{yXrYwkh%m#z1^8h
zl#b0*E?yH5)i&gb${ANs9pSc}3k>eJ`=>5AtnZ}rku|D~#*Zhyzfl)KADMI)fTSQ=
z83u35;f>N1@c5n)r_Rn&9sRF2I@!liKJ!KHr`VW2wr`95-)oW1KzKv;u%orrdHw%m
z>@9$*+}ihHLO`XvK~zGzyF^mD1WA$Z?p6_LY3XhRk%kS3gaXptu<3Twv4L-KJnxD3
z^*8^|8D^Z>u%BnGb;osIcdS-4SL-t}N^Owr5yL^ybcliF+xL-i`4iedaSzPA=-GqC
z&8|bS5Z*cbeK44f*=|fiPyK;30P7I9tx6xrX+-snNhrj!+-xTyER-h!j9msT?@D9E
zo-ns5BakD#x%Hs2$!?dGql7E12eiO;WpBi0A$N1BPG61&zHkVKQ3<JwziboH>azN7
z3qZiT+sgtapBQiv?tT4-atiZm`kA!9!&OY?rfH1TACwU@S*+kCV-sL((l;~9-@Ppu
zA1?#HtKeKSXXe^jrTKITV~KW<lG~ZITx6KNdF)*4t+)+5upAFu6-&-*vyilt8oWLf
zE>MDi5dF9H@wkWWvlPL%e{g`0hu0ZJEoL*p&if=NT<hRXCAf+h+@D9qmN5<LKfsj^
z(tHmR+2<xECO~s7xEWpy){Z}iSJUu3P0po*u~}R-6}0Imn%q=3d+vLnON)`tj7ZK1
z>%00BXknvaC@eU0q<<G#GQU|pd2XY2EWomh|3E)EFeDWsiWeXc0jPSBUw-tQQaEQI
zf^Ecgk4>vARU%BjAUBtQfFRDXvks^LFqGg<3bbX9vu{GP+!vti5kUq0UpQq9Tr(Uq
zbf~D?-saxa`;68seVC?SyN^^tb8Hg2feIW7n7(W0{`@|Q(%{xA0giq$M-@3j=D$Kc
z^ll{g%Xr*L)j%aO<LqMT?ZX<79%&TDw4atfdZs{Ok=(wN*8zVXEnl0g)&3b6AA=^Z
zBDJZXIK;#<*I~5nq%M}I2!;Pj-M;#Kk4<Zf9*~nMKPQmgh-Xp(hRd_>8xq!0YLFkY
zZqNl669dHu6+-JvDye956~@6)eN84Edo9<Z{ix$Wq2yxeLX-v%iGb5MTv&<{QyQxF
zfI)A@1=N7+$z0VN&-ibzC3P9^HJ!|j8~UfA{TLPn-N{f$UhVGW8~(?PA$D$p(&-`h
zL_{wAuh#tQF!F`CmT!tXmR~wN^#m8dG#Fy~q$(9TL^Ir5>_YNXL!JZJBiGiC&Dh@A
z7|m=r_au4mSOma6ttjNZNNk87B7%6`UlM%NaiBs3!eL&2KQPxjma){|RJQF@2Q2p1
zB3E+1>$A)4vYFE7T%XnL`DE<5GAhM~D#0wETwQQ0%-PvlvUvi31&dk1fiI3?H`g^R
z%71yBQ({@c{F(WwKBRXpSFTfjT39#!i;HW8nz<v}k;~Jdm4s%p!8N_P)g_ZZAj9L6
zqrBYQ_rX1K-sy4%?oj9nC37DT5|n=liRr^kZu@pRkCm$*&01+3y<wYbU^Im&ozfad
zwn_SOcIvH!0weSOaA4@$VnSfA<_@pvGkhd!=zK;`9#<JEr7FFWF<ATfVvcSS8sqRR
zX*j(h1iG=SeBlG`niTF46thU4TvhrSM$EP<%vjz+iI4x_Ts6^4Fi%2eqQ;iIUPR8%
z$cQGx8_Yj#Lbr7;(l&LytWDvumcH(pzt>Z#R5O@IijYb1mz4vZI_TZ+t}W+em44o}
zXRCgniajDy=qyVlWq;}IIZ&EvM$qF@5dy=?Ednj$U{c~T2IC!FFux$kIl&8O=UKm%
zVB`mZ$*RgS%PDnHTp}^iWd28=B?Uh##rQIRZ|`ih8f6@Q^HHoIC#jc|?t(q4d=>Am
ze?HfWmLqsKQ+mEjh0_(Mu|yCwYPB)<A|T}bi>^>{gDV_J+q_CN$-gnG@E>_ZF$!$`
zugkI9V^c>k>j*yxa?x}q@Apg|^5^|9_;x0D+^>e6Jv+qf>A@J5$7<aj%)`PWMr)Fi
zIb5(*NlO+#_wAK}4+E=_lhZyhrz`3GSE-R^H5T=HHdKa905eQwpj><jM6R0sfqebY
z$i-<G>qlqS6iC)}{xHmoA437r#r?bgKJsrF-1ZOYJv!cKimsZQJ>4n$@qyLm!2(V5
z{sWgU&Au0+z_^vV<vIXaWFAe;ST=dEM(^1;^4h{|icXo2mg<nBD?yl8!_UtzmQHcC
zH$?``1be)x-Q7(0z(gI`P33~l*RH0|kJem#+LElUlEjIeuDT_Q*0~0Kec@yK<}U>`
zE&vn)h)6kXf!YpuvA>YeP4^3uJOfWJc}4m?Wo@a3={!?>YN3k{U2=4+tda16>9pwF
zu&Wct`P@W$TH1<u*X@butF_WcA%~B#uPEMNs;p&n_BAB;mwmT^RQZD(xSaa<M_rk0
zd2cebW~*9he(xR|axv*iVR?}>wMBPx#}y4;`r4JO@6z<dv$-o7^vjS2{$`T)Mo=a^
zM+ZYi2$20(^?D%fsnX2>k6+&@AsMD9!riZ(n9$x;lV3gS<(nV4zHjusO|dB{%kC9}
zV4N&(_jEIMwkNEgA6rn}bcsjgk*J}U`c(c2VJE-qrpA(Pk2U<gv)$z9ppKZQPKWD*
zGE|<D6d4g3ymM;G1scn(4SIM9+Mc?+2Ir}kPh9d6+DJS*OF@z1N8MUh1;gW&q|o?H
z-p&RP8dv{I4qp(84nc5J7Sd8GTtV??uRT2<rJa>t%6L^&*(af6k)_r%rA-#GM<-LD
zdM)|6q3Q6WWbVq1%=X9aXP-Itak<+R^uz5gX)2;>dESR3hx-Cs=pp~MN5B1yE6z}g
zQ<<LD&8+>sagt6H2HjG}Jp+UQoqxD9-nn;#c$qkJLy+jy1<=MY+z)mur`yWxTqa-8
zV&LA~uuC=_9t6~PJg+@2t%c`8N+VN4jvRr4j%hRtRcdY$tquym1{gD4;oBOI_exO#
z|0PqD#mXTM<s`pk(nz#Ra(Xu9dYnuIS%*~KQh$cnHE;u~*_-J{Yg{lieNMf1O~ST>
z{t$Zz3^nah(FLezp;!2D?}Uc<ogrdf)phAJ`zW^@YW^`=wYbXpwY9Pf^UG_6LLc4u
zpDo_4`NJsJ{I=tb$cca9#RW6g1U)<MZOl1Q!IO^Sc&17H`<cA?mUHheMWo^qYyc47
z>P4`B_ZTv-L`@mkGAqd`#wMimX~kvciL3qA?3XgmR0CJxy^;*34lPS|NIgqm7qvu$
z;dsMk{jEmPTMIgWA+H&Q*0V78$cCi2>Fd5zol#uaYfhtIX62>=U-%W@%QOl}Bn`R3
zVvF?d?S;pX3%~1+aQZVN1+|?i5r56jHmVfYOb<iV%g62`5T>z&gU|ZzNo~V!hJub+
zyZ8}Sv;Mj`Y-|F)kKA{<(1ZI8MV&*gO2?|Cw~<}9&t=;>#fT}h)yoox_=`Oi8+)?<
zZj`iJEqGPAV{mFXceS{4U8aT3N2v151#CjR9ETK_rNZjrASjWxe^)#N%3KNYGb$SS
zX%@Fn&gh9h+(L#$jj`n%t}0B*#f(AwPbNGJYPQN4?j%{|Tw~Q~2{#>&6nrZq&`+Rd
zHmvtqy7Mt@R#DJ<AA6kDz*h8<wGlm=_b)eKXEq-)e4?D(Hl1QXIR`y{;pdGiE)6u(
z=@oHL36dDD7!>1={<hDv9e3`e-i}ZTzZm_Je`pR1tTYGI#o~YfOm<-V;<8rLu6)cy
zV(VvyM%RYCO#V)O#BGiz3D^0tQ>Z#5g3(LPSQ_@Y`Uhnbg{wRp(`Sz^Z&P;9HAA!B
z_MH!u2)(|DK=}T*;3pl4nH4~eaP14CC<iXbNnYM!7AKKt@6KH>C^j;oW59$hy?b5o
zQ4r>LTu+U@9L_4HR?=hZnql?Ot><z-W9@_>`i^zFUH|fn{mNu|^5;LJxQMaS8lO2u
zT$PgOZ#*3|&FV!-QM!h}3+?!s+)qY5GvU%M?+b;j$NcJ=Ybh1vvu8I|b{;5R9(>MO
zJ0WxwGg^8ajuP9`Kkb@f;UaL2XPa2Av#8b2Ez(l(zQ(fO3ptsjk>c8UFf029<a0r9
zldn*o1eT}Q3M@>fZ#Yq{+3Ae2R;J^mm`xDl;9K5B#A~rPg+zmxGQ8t<mwu|(V)#zu
z>CAKf36W3cmD1*UN7737?1SeEJ2LjN`is}6h36vXvUg)6)^m=>W&<qwH<eUh)PEk_
z(6H6X8S{lj?rl*1jW_MgV4Zv4uIASumt+1{tM0P{?zgoO3OpYOv@)eqg!KlfIkr`4
zI-OUXewqAg-}gRc<GlBC;kp3>Iv#y!1R8OqxnS66AZ%XiU75?b<r*mNFkg(!mjoTW
z-q?K6<32y?4O2G(>OBdR2T_V2hLnVRR@R1Tii9f9A9^M;gg|VQ$=4F&L~P4E8!lpO
zQMm<^;XezaunyR8k<OS_<LZ36;R5KhE4fUUo&qO$*9!Nr?z;=K9!)PQrxyp7)%+BN
ztG}wP#F<@EbgwMod;I1fri0G+a0TKz?Qv1>zsG;H5xm2S>u@_a?$F-^ATOy90Hh~?
zp96JZ_lceKISqPEho4!d|0qsvT!QBcrx7v*J77d7Kd@i)DP8wr+IQ-FUOFKkq#okC
zKAR)d*)iizPJg-IBC5H!K?lnJ5+p?<Ym>6uPv@rnoP~(r;=kqb*AF{cd$rLpvd?km
zQb&@i6aKXDF=L5mc!}AC_7rb#vy(uN9YilYU^~(_!6k7{WOdGiQA}iiI5?P%LXh5H
z(nk&cPRU*BY$NWB|Bz+)^|v*O%yMmt_6&P^^X|S9sl<(YRk|}Rqj|*p_OuOLoQhZI
zkym@7jKZLT8G9^`a^2<bwc|MyhMrv|{wIIAl%C+QvRTdd%%)JUB6}7k<eu5bOBq;R
zO01Ejj3((+PV6jg=)I?9dS`p{E+17#uLcbnB}AWV_;7IIzr5c%4f172<ru)g+{?ZZ
zi%yrMzHa_{`u=N4KMy-|#QF#a30@rit7m)%I3<y(*e0X6e%hOo>Q`}ZILs_e+sQps
z&^{bg2s*|q2VW2l<L-W#N#KcMZd&|6w5;zUouB`aoSBk>IXkegqa%-S9?Lh}yrqX}
zCPDslu{^=Bz_TmO>SD}dRaMj3_^l8)OHa)9T5X$H`S->oH}{iE%)3jNg@(04j>@Uj
zsG!UcEcWM2sq!=1d0Y2K0Z$Jb3RbQoyoPqw5W+y_4KzClGn2OLBN&1JQV?`PuC7~_
z)K#+fAUC3QrBv{h!-7_r64ww*N(CFVNG}M<%`1jXUq-ZrD28q3JcQGue^i=qdDmB~
zk!z)B<xs35%oLWovEhIk;~KpquN0Bor)Vo|CjPF}21i{vR%G*>i>ov0T~+EP!j6qk
zXixcp+Ew>yF8s``u6W=<Ew=5daW9SWzG3isu3~j0f&Rm_K-`n|Gb04hD$i<phgPeM
zmqL*6h~SpwN+@AepLc~%y^|a9E$z3E8b%^oq&!ZLumRP2RUF0+dmvxA!>S}ZU`c<G
zl6uY+`Qzb6@qUR_1@?%MB}ebzhM5HC@rG&1U~X$tX|!kpCu8-)@Qlk~%i2mYqB!^g
zQvdQpp~{Mf4Z}3SrB2=iYb=%Y*D7UBo>ZZ!T&DDwY}oQdw?womS8*Z385x<0TGl#G
zTz($o!vxShri!iHHTaZ{eiX<FNxGMQl+SSLc)u1oWEa7(8oR$Xjk}cL>n=7%d#y<0
z**E|5AgFE4L)fk8n+pqT$Uz%2Xlv6O!sFc)J#0Vhs8X$V${UPwt7+udS>`6)ofF8{
zT4Ri7oYYSsm;6qDFxOx!+;<Rr2YJtaw0#$f(1Y_x=*M#ucq=A8&sM?@9=B%G%Y*1t
zZnFFj9Zw(HxN;F5gbE!tJDE+K3@G|P{d!AKh9FUK&VPQdWreDzVR#X%DfP+frGVul
zn+yYsVuQTC+!bWNN4_1DxNLPv1w`bw-6-nlx@LoBnylT1KRB&J?c=p+nDf`ofc?e8
z$t&kY8a(^P1m8#pJ-oGGG|+8amrI-b@pSsSbV`yOB<-7cGYe)rLhfzC*9gw9N3IoX
z8t+91eHh-tZraP3IGG1@*(VGdb}XeyM08wy9*`}N1tI_T&0)Ion(@W@MtPUTjLeP|
z1QvyTR(G`5J?C=YksBiPd&*aUTY|NdaZ;P%*=%_BQ6NkfwWDP9r;FraovakF#`VH;
z;fatkCm%w==5_NP6kZ+!SgDU7^A||(UI!hsJ^B2|Hw7J0oCA%~jp+dyK9GkJU8|1u
zUs;-6DJpKS`XMpfL84D}#_Z3WEfgR&uT&&v#ojr8Z<+06I|@bh4v*CwCA~Cy#`z&h
z_&E7L4pXw9yR<b2xzjpSyEY{E-N<)vQr7RyE!}=T<@zE)(ZU^V8EjHtQoQQ8fPa8R
zhW2W6Y(Lt4VYhTy40^BgaXLga!jbpIY~#A1M4}Gf$0|oi{ubVbbb^-<<?e8g)whC9
zyUwTs0WxAt=>Dzfg_YMSqJ#svih3_*Upsk*b8+>&j&1VgnPM}q*sn}K{Sfu^h2BQ-
zNX+^sJ5JrhK5T3-tiVxD-oxh{s@RA<D1APGIO`u_4Q&a4C4av9wjEb1njLYW%Xeu4
z-@7@Tw;)Y)(-LZwV79>L6Eu?Q9c@xDlt(-1zl`P?a!>8vS@q|xcf#aF#u0ASR$_YB
z%h#qoPmJW14A0M&a)kS0waSb13X0dsnIq*y)emi{T2wpMpK6k8jf1JNEO(aLN@agI
zjb8%x=uDsSx>#q2>S!6mYB%wkdBP`Ixxi|e%r(QNy;2&s@t(_2#5)LEX*yBa5PBz7
zvEU>0YFaCd^&-8bi(pJ?xBy9!c$iopa?PB<`5=7UrT&4NQuHSAf$9P7sd163A!KmJ
z*GDMdTJ%i)#jMVAjhX}Ru=ai;jr~oQoCr_XFaOw`y1l49vul^7;336wEPF>Gy2A@%
zb_4a}G=AUfT!E@+6}DaKaxr^%NAaICTz+ObryoF_NCecHQbhfWesXjJ+FT~tevUcg
zo^*QnE@+13?iV%(#5D9fMY+0B7?Uf+V&dt`snO>&@a+hU`#6d1XN!ox>|V>&UadbX
z1eUXeZGY|Fq|+My#*q;`+K-WcdN=4gFQS2i#}7VF436eD6^CX#F5U-F;ZkF%?f%sm
zdqQ6U=h3@JggJ3EMA`+HY`fOnpGz3CqwwATdWw5wvVp@@=R(uvUSxUDk?g}4kbD)}
zEWsz_)x}z~*L@z1VaTrvK<DYfTEWGH(VtTV8b927g=6flNIG6X?ss1miMo<H2yrlM
z8nHolbF=PVjH6sCHK;|pyA+c!;IIy+1{h{V<Tx9a>ImFVSPa1d&(LG8pRt%AuTsrZ
z>o7jJvH%;HbQ5d{iCk(#{0B;-+=`Y0MfJNsY;ui!J_7IC%Gp=!U^5m7JmDm5YH-0$
z*`(;$vE8U?u$K2-i!x>$fqxvG1AP0zW8|fy&Z_?A{A;EJHnEzMOSeRnxI6Yp$li$g
zr|ZN4REQvwb@=mbf7RxHfqCG60NpM}-)HlbN$=`?2X*AmTs50ixl<CZox9n9ctiLb
zfm4vMIUv83vO_%QLZ&A1rws`yZg>g(0#0I5pBn+;Hp0KC5W5$kR`Vw54iCBdk*QJm
z^{37=48Qw`2!;P<B=}UceK;K93g;QulKy5YjQgBYzXqUmDnCR(Sov4zy?`hA&FtY;
zK*l@M(88}O)c~O&AmIERLIDPAi{$w7O1Ci#a5D%9&N}e3^q(z{ekFs#<#oOeD~nx%
z_M%DsP}yARp$CW8^|sdAtg;U|UwjF`7K0D$s-1%IM&M81f#E>KV8{$fV$~a`mF`LX
zTbU2<>d(A@fX~F+!4MkCUv~c!%%COR(*1D&XB<;vVUZ<-#r$D*G4gf2=BTQ9KnsJr
zXVu{hu2R64iIa;IaQ_bg(Tw0lBuMO3>+6FK*2=eM>h;IVD7QuRUm$h|xqxzX3h=|w
ztxv@w;{FEz9{USO<ezGSIGa5*%tgd}TC&@!Q*D)BpnL=??oDx@@xOMqv^1tu@Su}4
ze~famUD;x4PF=1qv>GX7N6lw<?~*z)sz08Ac<7NigkY4#@Jcg^DovpTwmGV6_CvP7
zx3C7A?2!7=qYwH58il417gC#n%*er8L;AT;1O!ViImyQwuNv)MFR1M9TWr)nNJR)l
zTyh1~;5Rn)_AXJi?Q|I&o}DhTxAbkiENx>+GpkuDg5s=md$PyaI43q{>OHE7*>3(0
zCi!ZWy}$VfF@?lTJ=ua(m=fsRhqjZZ@HW|35&JRHWIe7l!^EZ%mNo)iZt65mG!isR
zM~+EpLMOwEZAV0piMt?>Be%q<z`yr300v6z{&2y!#xXY*gJt~BR>Z=~vxVBzMS?AI
z{4S^4`?EWB2}Fj9HHW9FjoZ;TaA!-LoRr$zM>2gamX^DQ0VSF==XAQ@QVIXrrgato
zzx}AS^>xPN<%S^sibTU_5;iTdDV+VJ`05t3-c2VdaHX2CmXfNaWk#iWm}~sZp@omQ
zmw=cQTb2x+a=N1Gc&b2=?bJ+GtifG;e>iNxx;$d!Ys{{LKr3l^9Y*Ngds=uE$o`d8
z`r00m#rRqXXJqR;7E>Pu+mvDQ+SW4R9xuNH1V<1mgVt9w#k6zcV*R$amt(uw6Te-y
zwntdDw$Ji+kZcJ%?H?Jr=8WjEti8Ham%0j-sQpHr@jlHr)sFE{tmWwpXaXxBL@{qb
z?gqIQR$#Q#n$)oH&&;PNg+#yZea;fEf6r>MWXiXYEO+g{-^E?53O<NUs@EMUGwY@5
z>0#>b(e00~F>G0W^{R42kXxBdQ3>k<RYVHjsBPJr3G-~jtRm(cL#WWu%dW?fd@i?)
zAU>o7{(Z?u_kD!~y9kI2Q871p?dSMdS(A$+>IS+D3pWP&`;|3=((|@hG-JZU-SP3x
z-_5#=KS>%bsXg30^7ayPvBbjtojN?5%|6YK`7q(NGnG_r{mp^7ua@?VC_Y737S}*c
zjeNI0F4p-zhvPjIAKh9%@$#5GJ_<e$q}Dad*KRo}85OCHj<Xvpf=mmI_hLwBB*4n~
zm*QE1kczM<QQsTCclyaIrJ-S&$aWpNS${kIG<Gr30UxJ(bH~DlZNy>D&wIN}UjW>Z
z^`36U!xv-a5`Lgqg-zWKPN@Qn(fF)9R9&WCV7~FV7y+V4!{5<+c-!~lFv5PmS*6jH
zC2!BMI}X?Tm|}C*n^}MT^hN-Dq(I8o0OIjfb@1%j$7B^|Rwsx?D<o*u=PWM*8SUAr
zEYj=GnKLVcL(Ve%h)Br5)ctZhFGg8H{n0L=VPazE7Q}vU?5GY~zWWZc6g#iEy+*x!
z#8xibu1m2SmWoRKY$kj8R$^_Wa+Pj>`pJg$Rc5WZtSW0RdY{J%*>|3&>$Ea2G+?0E
zL+ZjoJD=3*L9kw`|4%n=J0z#JL7&K__^49Pmv!#*X}7)a;0BVF`=Bg2=-nEC2IFyR
zsu@PZ!A$h!g}Ld$x~ugVjVrywQ*&aMySE?xU#Az%W!{X}T00?Q@(>8G%cZa<b?Eli
zcnb^9k^Wuiic6Bu9?UTeZxRo2P1+&sE-tW-k4ehcZ_Dtg95VO*(0j;Pww>DWQ&!6|
zsBWmHHYcguwy3kjbKiAq%r@_d>cfJP>o1H*JO&kFqZz~o9!y-$wG`Ery+b@IN)?-B
zHclQpI~Msg9z~9xN_`=3-iUSvP-%xbCCbS6Q!d`R4Q@y@4*0`bzpAcf@I(mTC6eNc
zSzh)oP`V6*b>?_V3lCN1NQ=-)_=wr7cIxK1F~u+Hl86rgLyKYyfhcO8bkSe?{#+2<
zhdjm><)0yO+F<-x7@YjgLote|3dT7%h4Sgm0?3JCe91>ohLp+K+0pnuH~BclCbPNO
z{8#{T>{bOn-gSI&vbJdw*C?#xwR@Xh`~{dJ_h$@M+90)Y-lOV4Onr@W@*w63$E=;G
z*W{UJ*h{|mOTrw_Fs<$1+y*A>Ubt4-mxcmucVLSZlDR(^vqj?%XinYebf2ga4BysU
zNGG#XUb*XO+ptBe#}i2~G{Z315y~*8UyPBI6o!Y7W~JtjI2*6H+rP16@tB0|q)7S|
zAI9&@u(oR$RPKR=F%0@>aBy_q(|mF$kF-<N<0bdhD-{213j@ez2QixK?#%8fOdXvF
z9&az7U&}D-Afwow8cqU}jHNr;i1s`X?YN&uBUS&tjG0SIm6n2wE~xl0naAILq;T$&
zQ41}$kt?om%u2I!Ma5vrv!+pC+-v8~-y5)|@4IoU*RSlBU!K!OksZa^En<GQIQL&4
zAa^eGIhi%6U!(ezx?on8JpdoolHs*bF&{6ZQ0skOX|QZ>*i5x)RtEKS^oXQ@#ofr$
zM-qC4#Ai!*GR#ADjBgQFSfdisg}&aw0hLzYK&{dZI#{q%oq4Qu!g#lyPAT<K10hU;
zZ0e(mLTzfmyUq1U(8?Y!bTL?B;O|S&|BeK70R3n!oV|M$ZfwKduOGTrvm*9rHHE_%
zva5v3EP1os5`5JrC?O}qi)=>U{bftfyG%RJezsd{eS!_WhU29NXq^_rEMAgorolqL
z^k^@xJj?eSn;;Q7w)~+-c>k4dn#EA=a{^lHbZgvi(1sA_Q**2uy)rS#htSm`5=+Y*
zDKXFCoQ<+Vq%oc1!2JN{xAa%71ieP$^*uRxYmp8}w@|~!IY7l+CoKz2oo<}{ditVJ
z8Ru-^u4eOiBh%3*mp8zK-_J$oxlu-Pwpd_e>GI!-9X&O{^U*N7j#_Q}3Z}XvFLg^O
z;&k>5xNB*9^i5~iao%qQqhC(_tJK(H?v4CTZr(tNP*NkDX!J=V#{CtDYY{f_s$l2X
zt5nV;zq228Qi0;Ya=nJV0&O#|#8rO-O%`+s5CXFYIuv<9*FF*$cySkis#`#_<C=`^
zu0;O^aSKE!ubyOurGWA|@K2KLFPMhZKm(pqHcj<En2?<WOh8|RpD_u!+XWYyrXl|E
z8wk%3|3ZehOduN@q(HzmPhNb~;Taw-IQNK$nD{U#8BJ9MZH?Q1b9rp^L|~~&)5f%c
zYk8H3^-o+zoSTJu)F;&QxiSg>!{H|HZ#X<mnL(zjjObZg&aY8jA(x*%#5%vcg2&JM
zH3pBHl5($$c?7DzU<oy#ViEiUG;Z6n{UbVm)hJ~<UBe{SQfS_Paq2i<8~lDQ`*h(d
z1BpQ>^sUQXQZ^}leEPnLstw<qrc#5ZJgxHiZ=KTgwAG7dACk`v%bR$IcJB+}2ZfKH
znDR1NU%$r2DwqPUH`yi8y6+vX@{i;rKr_0N{rv)eqt;f-fLcNxCuj7NZ2?L0qpf+m
zA@x7;n7XH}yO-oiacU|)2M3P@^%TG7zDsYiYCd|mv)n6w6dZ2wcCDJOWqwsO%dyK@
z|3A<dz-Idb&T7+D^-t)V9=1{Y!>`p<WM%SK$BKq4Jw2s7J-2qQT+-wmz^wGw*+}2x
zCAYuKGusjqrRQw66N)#ZdI=@FmGu2ohsV@Ai&2XDFv@KQAYzj<Q5n*Me)s+;(W#)Q
zGx&j}e4-iib56n}18dd8eT%{WLdBnc62+&dgBcCvXJ=CI0$Gs_JJl-P>#L&s{!&@f
zOHs<-`$W^<4+LDTX~E>Zjv;&L-a0o>Zx+#%p)W<$m!=7)w(wJyg)Fh8rvFCXbB)V7
zv!Uw@AI=D9ZX{(!*N>Sd<8Lw)Ts+ptBtrCBQ+}5*=y&^^CcqSoCZjeAT%Dx<0c8@6
zl)|S;QFt$T#^2+;f$f{Bzy++w!gAx_qy0&-f;JQO8;_@qj<;7dg2=cFry-v6b#)fS
zyif^xT>#SR&=E8Vy%Nf5HRL!s=M-hvAGr1O89cJ)(l-`zA<67nS6f$R%Q`ul4I#J{
znw=;U*7sj7RI`?$x#%HX&FVHjJ;iCw!})9=5|6&hKecKTNo3f3sz4!&I6wB0dZ0v2
zO)ZhNcflc|{B^qcPtU2$+4WU(9i>~BM$&`tSqF+)bwjr?Ad0wzno}!1E!keIujsE#
zPHAUuz4`1VL^h4ilUh4Z)o88Nhp#N0D<@!J|HP9tfhcsDeUn)dK(m>(RGt{FpNID}
z5$5t`lbI8#vg|XOVLQ{rR<*2#8sNHc(hBqbthuIdLqU<{fpd|aS|bw^>m6YQ1cYh!
zqs5?9*FBViO_N>!Dl~&L-?<}-$Ibsoq{LuPj?{k{(AHvI0&^D0a)JA-WkQ#1PKGXF
zn=|#ZnUDRmNw&TO63(E7wE5zO!i+jto+>$<)nP9Z5t<QTZXkW!bi!N~Wd3YI5g2V{
zH(e8Uy!a{H@csq;f8(*}DEpIwEd_va&hB<D@^{R`^TMc~e%FxmmD)FSKZ|+oZ7;KQ
zo+GbzMl!$tQE;)F^e#b@u#U_B;t1FS^FANix5MvzC8h6+waAY5$DU-Vl!Ew4O?^J^
zr@m5R*M5Mm#&RZ*Nfk|E%Edy|(DXcj4J13`ZPgPEM>Tbf$f|At6)!Xj=A`n@vP9x(
zW+&5e!TG^gbbC{?-Gm&sn&%Cup(OfTdd((|M>l?&Wz@s9r5I;Mj9N?S_lk5(L>omt
zJe!S&uS-uq4=RZb!}+LsxuDVMlsub{aG^^?gcS^ALnuc71-OI{KC&(kY3l5DF^7*I
zT7Z#VTdb>7m^>f`TO|!pB(8E?LJi$A0+36;=sSD0VdTiDesdjtr46Jr2gM4KFJQ!|
z>;r$Yj%P3MJF5nQ5B@shXo_I<nJ8JT4wuG_E4XQ^Ls+QF&G{8?I-!h-bTp*IjhGGP
zwo-V*0u*{xEK_*AnT0M+lei2A0a^u5(O{=qL7&A$4kTkUS9Hafv8;7TWPD<_n%&+W
zm^@-A4o>jG+3N<I^@wC!KIvm|ro@l6L@ZJHoyplG>WGyd=!ABtXXli(9Q1w8ljjQe
z{n>6&g@S%1ISBC>4P(Q)kw4f{0U!~W<`WbEY>o6Jvr{#a0h(2pR8oi+Ng6Am>D8Ou
z!e^xsbr>N0W>o4>F5#lDqf@$E=jd6<GsStYttVdf9^KP}3M&bXEyr7l!d4^d4MBVp
zulQA&sW3W*;>6M2RvGX18v44L0C(lK)7{lQB?fE_e5Ysp$?OkJn{;7V>922i7LK1U
zU*Z_@K$KHN$YmX~Ma1Xyp=b36((k)bld23}o>l9A&?o;0^WDRskHZ_ZRKN2$LKOx-
zz9SwQunT(GuyRXML#AK!WTs?hGRolSd@F~44d*Yswd|9OP+Q2J1EhCmQ}Q7yL;@8>
zK_gE&U6@7LC~vmd52B4r&L;AmZSjsrkSa+OU4rT;Y56*$b?MI$*7i_JB#kQ^T=18%
zbiLhuqGg_551g6HQut;1%AGD3J2beJJ7nd_S<bsss1)6$2p#-BSY-+?=!qu|a2V$w
zWN|-Px%Dh-ePuBFzhXC>8J2E7g7OQjTm|<`@A!@EpI{pD8#JjW4VfigASn6pf$VBT
zBwFsLYSN_`86#u|MAR6*?m<CoKBdgGIJh-Yp^Mjp@cLsav8&2GCKd+t-wA70ofM+^
z&C=f%mu(55Y0z6N9Z@gGQM~<)p+>S(y>}zSFCh28rPAQPzcCjMVMu}5e|#Mk@wkY<
z+1%k_ky{4Sr?&tVJ$-5=-N|?3W<|ALVE*oHe8Q0Ksl+wb(U2hr;-M~~U?F#g9V5YS
zxVv8gEtn10nl;mbsY!`|Lv<7OAeB|z;|FuZ|4fsesuBSoBZnbr3>vL&B4Rv;2$WVp
zNg@16Nxg=xt`hZSw0s5;@P25HxPBb30lW?wj@Fd?IP;fGIYyua1{(DKAJp1;da?Pz
zD&@)ib*eV!oKu|!1Ty!5;}LNFv<ew%x~kl{Pq<N$z}#^psb6NFdf@Rt!m?LfbGFlN
zD{I6@+n&^7yWRo`@5%P$*?N-&kJ#?q!A$0{K3z$?&Xj2efa3k!d$5%S?_p{hwLWKB
zB3>>HOOewN{bdGCrk&x3RaJvK)AN?@a+Hyv3vql6fbTdfB<D51hL;_0<zi9WNF@S)
zouXLg>o5P%k^;oJvG=(bf3aYapX?w~gAV`_%@Ydspb^RT*8C-3RoJ$v!k!Igy$r*r
z=}#rtl6bbHruw?4=Cj%Y0IaAGR*(hjRsZf$ps3652n|<End1|@!&9-^d$GDkoFs}~
zDg{Wo;aktj=Y~qg_wey`0m(6N0~e*(qKMLj{KV1zK^J}+@Yq1*1-Qs%3o53re!W#a
zE^YNzmtbeC#lQI7xSTq*zmJ65v13E;z%`uF@6IgJ9-ypd16|M#Jq1G!8Ru&iRHT7;
zzAD*-zecyc-IPnFTWRKcR*(d0R}cC*pyp&#Yul4AXZr>7WtNbKdd?ns-4WsR+~1z8
zOawS*D!-Rf@S`bdJ|>Ah$?ZOm4s8EFQJ^99UnuZuw10ZAO4s#m1~l<U+Qr7tMfJZ}
zxu~pKS%6i@$$5XAt)`^?#mxO-sbI*0|D+9WFb_5@T@BrI?#10;Lm%B3d)3q|pdoE)
z-MHF%dWZkOJKy^Z<lHHnv4$?qHb-tast1Y3`busa14r&i;n_6$IGXzK<^GoMK7f`B
zjTLnBq2qEsiKL`<{4r7r>e~N6w)N@i5iAV+J!gRv6BcUdzfho>)uum;P3RXnH~PWw
z7Y@{WxI3i42By#k-V0CkKgIV~J4}s<`C)hoxJ%}wsy0B~7Tx8sP)RHcc0oMUTerKa
zsx|4-@Cu2_ft6EfXs$nRU{s^+0!<J!Sy2-aY*`Jj2CEYp$u;$h0jh*wPTQX>B(|z#
z{$caJVXyy_$G-8MZ(|z~Iyzhnu7mCj{D-LuIRrOV62ilngYE8rU240#6yN4+FNTcH
zkiIpt@%(Jzp5%?A*BLdHU!1?_;*b^aNMT$9TgYs9H42VM>SucfM#~%jLtNJhKtbIB
zVAJnnr=rnqMHtrpBsp%0hq8Msr=ROnf_OyY1%|W1yK>oIC=e(04-{y4$h68tuKD(n
zy7cy>bD{fQ1;NSz&%pHHX($=M)gG|8?Cb-eK$i1YO3_Pscg>Pppz_=$ND^lQzij*e
zuJiN{imT?)F)oaB*4X7a+g$L=lk+2cc}3sEUFFZh-#L=(mk2k|!yE1(gIFi<nd=!J
zaar4TVY2xyb?BEuzdymB0+)skR=u`j6J^Aark1_05KB0p4FI$I_HGoB>hNB}hSRJ!
zWr4ZPP|m`}&t0uewK<NLgS16HQTEF}^)-}8?AL!`h9xwYe~=fqtRu8`6$`lL+S|vv
zx=xfp1d{X3D*Y<saJx`dIpp4){}N#X(+TRWtS0~<WEQguepSUJ5RvENA`aQ?<t-UT
z)z2fedOAH9X;xg$K6jf&Wr^@Y{nB$WsMe*WwW4>ZS_v!1Rg3qzUK3@nhMg}7TTczV
znSJj6+^@Rfd<Ufl^Y(9I?JdRKSs(29W;0rtKN!{)f*A4g{1Uf;T<J#p*$qI^xXtBq
z?6s(e1GVQ|CK@$`9Y(!Nd_qYryTNcytH*<+U(A}4QnUL^T_BVm4_Df{g`%jW@w20{
z*`!B=4El&iJ=G@F8`#_1zqM4GM6_mz`hbviaXE6IdtT#W`Ih4W&G{;e_D}i-@(``2
zQ)p&i+BBt@pH=ItmgM8cCHE1~P{>9PTOVgT+ild{e;q+K+7`i>dn2|QqbnIUF3J_V
zbhOn;+?MR$3|?jt>*v{yZC7k>F9GZ=1$Gg2DZ&I``Sj5r{Eh^{gBqlqT*NR!<wN2D
zefgEE)VWBaLfF2I`Tm2$NHApiw?fxCn_VYgu&Zf;1R{C@vCsQe*;5}yT^qUxgXjk;
z`$bL|3q;p7%UeW43%DY}wph(_2O!K3x@5-UF{AYgfu%>cTV*U(GVaWkl+iM*T-^g}
z$my1&`nq}l{9%vpS|Qf|;=rT$zi?n@yx9wg(F1@b8MytAMx!ywMkcG7g3DB0XaT==
zg`&deOsH&`-^dH%?4p@a$FG1O>8fjwhCv+CFmQjt1$Y3~Kj`g0Ad(C9zR2Y34J^o4
zljm6IUta@*=um>+o8FYF^*u8n#N;N8vrq7B>f=HPG1JSXJhU@fVZ`#ns?MAcQw`U1
zMz66AKd8|kZ#Ei2r<fd{=BsybD7~t!$LEOU|JtTr&)<WCuveX9Xh=GYDQjcUrG@4$
zznB7#3YEj|4&>8-#k=3>&l@l)*MS!(?qjW^UnB<ntXc{%VQ`<N>%pJ+QUAbe*7OhX
z^h6ooUdud5tDyXy#QnizfdhD=DPyi0j0@N=<s$SvgzDd+{Vyzc0S2O>3@5&I7ySG#
z<o*wm`>Ha!1Z@DpQ`lm3Bn?JqN=!_0G)&Q8&a1JXQ2X`oFyMSP!fk>TR=dK@-y^J>
zrw2O`^J%yuh<n^*1@F{#-tcgO2)|NsndEr438Wf#vWj1e-+5AQw!r=iMuOk^B3^Fx
zAL^8S=7t1V>o=UNcNSu%-oyZ^jS~?l9(Y6O{U*e@y?(>0RJ>?4bj2P6jPHqtNDALx
z-SyR4dkidr%XaGP(N^tn1UpSuD==S+(j->k3kf(4ihuMZ`j6(w&=1pUYn9J&8cTcW
zOj*pME4})hlK>k!s(<`iaE>*g^sY})slJfeK;|`=rCQVY9ZT{zK|NqaM`~;vLBay?
zHc`|kO%(F7_NtH?nU^oELl^5bCMTH`MG&=vvT4#JI^|Wz+e5f`ZN276>{S`YAfATB
zYx(_1)~eq<d=1TtrUvpjxt#e788?)tuKrBmG@|cv{|<D&8b+|UmQ}k~ZQf5aRcLt4
z>zKhlm%Sbhaz#UmQjqslgH4R|Ymy2Im{Y;L;AT(#O0zgi=`D~$ygVQ8<$u1|h93}g
zV~Mmz>(2+@7HT|yeqlXU%BnR2kdyNYL43lofOn0yR+nN_Ct7aef%+kym&yL}WTiQ{
zrHH;-F-6EFK8Z=dHZwgq5SUgyR4J7AOd%72EuvGb%)&CGAxr^2n1^oMJI~S<3r=YP
zL>2To9l?E&PQt!mWZXe6un%hb<P#JKJ@rLj%~F3t{@n@x;>{jAYWn%%UMKyc9@V&J
zvMp{BnjH~ukjj50G=*uh5HI`7p^1mvp7SCnRX51G0om&TEv@Uu5C)uq{O06e6>btB
zs2(l-naun@QPU8JVf9oT5_Hf+2>)OF*)QDmn1Kit_@HKtP%-mfuV^HTtIf3^a+JK=
zwNotDxr`&3k7ofq?(C3Kuf`=7oRkM!TEYL{{MoQR@}uqoa}j$Oe1FAo@BzUkXnlJk
z$b_2k3pM?3md~CsB;+H=u*(G!{K=u!DVzO~>XgjVehC%7ds<_68DKy=Z+t5FU6sss
zz|Q61;F*$FBOw#7O7!vz+&8yDWGP<KVDpGT)^(jdfZ}!B>)J=z*-z!K#E<lDDZ{&d
z%GcewZ3?)V8e%Jc`xX&5kR90vol;;X+KSXZ%?;)#(X!td-8yj@eX5-Cbi6?+Ej<sd
z2<azp(1J<*zq6d(?55rCM7*(%c14~Y<^kDg&hG4#W-}1ZD_r*hRKiqki~6!h(W{Mp
zEgKX9SQu4GRVyLMkoEt`xLDI*V}ylmDP4!n5YJ`{y4&$t3}o!@hhkA^w?-3v+l4lN
z4JNE*5NKWbxD@Nb))1(ATNW{6kJ|3><c(*mD(>NVzr`+8ym}{u0qH>yXp<t|X)_{i
zc|%38G{C?7k_Rl@4yge*zBvutNyPhKwAwnS)FxBRbpulh#f-NQ**LX+O!v1_=`!}i
zW{`rSRWpL}5jTn%7yK^Jze&<k3plSbMRVg^ybHfIFSYEK4#Do&1%!A)@;U29U@B|u
zO6v*B(*t;OBM1at+L)?q@faQe=Ypu|I*1!-mAU>rIjLUOL~qh;g|V2F@uQ0R)G;8@
ze1l-pw?Q&?B4e{3oeh$gLK$kCP!@P!dK}{~<R#BzFk8-;XQ2An;dh*hws%+zTGP6n
zDv+<(*u6BMO`YUXtE4d5{_ZfuTV7$DzJ#|?z^h!BHyuQkWq0rUUc$y8>75TBLZdpl
zm1WmA-t)bXzfGn)hx~AYGot3zoe2wRl0cI6Xc=<B_E88ShS(Hw0WQ9W{5FFsp3s#9
z3cbhzxMc^am1dT$opxp+8Eb=-_Kw`Z%4W+PZ=>_r;>gDo@6cjz=i=s!j+dh!bzRk5
ztuvZVPEys^zzBSZVt%6sjIt+jo~56h+xwW&yF2d#b(9HFD4&(33M7ZrJ#?JR3o(Ua
z>inqZjTk{Ct!`pVQHcjHRTndpfwyHtXkvWLOLi5As8ON!;rd*(5`svk2Qo52dFz{8
zIuF8Z{(9mhiahtF&fXPANhij9ihLgk{HrvUYFgS_6tz9^+CH_^yM6K}>VholiJ01F
zp(}QNes-oSPJK<MK?*%)Rg;q^lK1tAS$#yb-mCxdO8Ol=mC2LljYjqIodZ)1d~Ehf
zv=PKA!YRV(!$sWSQ!cG8Ll}vr8cM~uC<-h42;c(tWa$H_R3L$fq2n(k)bJs9RfR2Y
z*>ZDh<mE?B)*r;@=M!C!2*Y@Fd@t#`$2*>r^TrVnDC}&Wmo{IqV6m${iQ{CC;t~+F
z=}RR&gpIdc5i@fpAz&zO%=GF>ja9qZawHwLzLud^sPkLt_Ld9&<brD)vr^#<uQ5^T
zYfny2Tz7%Rt6QqBFu8XV5)vlF&)OSWqK5Jr)uq&jsvY#EfKZ^^)_%{%`XaZ+4Oz^0
zlA4iA3l?W4+!1%$T~t*FeFmXdC>w2V$2CMK*XL&c+0pOcoT5jo@Ese&HHvChXy{HR
z8iD<b04UFg_anF5X+jNOYV>8tl|bO69HoHU`}9_w!&Cq<dUUXs>6y)HtDnnQ0*fgm
zKt{Tgf&(pwCl%gxY5DScs=DPKN_Hw1ylo(^Z+27C)wLH;e5Mj6L$k9PJ?h%e+q&@C
zp0$T2y6ry#$Z(+Kc&MGTIP@6j!HXSK`^PXk9QP7@&1YkuWnNwu{m?bT6np%v-sUE4
z%40$@T>~z>S<Xqw_s~E*!Hz}gW=ln1kFuN9>0gzR+L?f<=kEKx_>COZp{rJ8`G_26
z6RTopY7@Guo_d7ll{Yk8JviR@!URr1FU8Az?X0S*1p54&)ZxQ+_SI8)@(L=%7RM|P
zQZKj5%S7n&E_D}&4)$>H!JrwGPiz6}$!w0yNu&$XI${SoNu~!T-9w(%C>)em5m}G)
zkrXJo(r#glhw}Nczt;HjI~l^s4q$zo@1g<`CuT6n49nUb%<+~HL_7?DVq&O3Q1N4u
z18`7WIJooV1B%Q|ezd}CAKym|t*xue&Rc1ln``T8)>^=P+aMFQOSy4t06CP8T^vTH
zAZvvb%z)s+(O2I(f`|~(%73zc=V!$iN)-qq!Y|nc&lL)Hwqpm7@g;koh!>I%vjaGJ
zgg;Ln;|Wp&iJ%R3K9-BzmqIff_Fv-R!#seTll&jWe0Wdt=<slF^1$qIs|GVQ83x$H
z_YsKtnI3_xzj>us&)!+t*c6rf%TEfp**x7Y@|yF%9@(ia$C2;6K<oI5Mpc-~z43Nx
z3iIm9YOxbY&b<!#gqay1i#bTr1J+g@ordbl1Gqe!ZRgz*zZVHm+UwLLPg@OIacPNs
zHMRXVDD<m|q1hLX->B+LP~p<I>5FsJ)&7?FlWzu*v_>>mo#AM@y2^l?XWMJos%eLN
zEw=yh&_>^ju$2{|ND5C@H?hvni0aYNek~1Q_;PGsSM&0Vm+zqwJ7%7kxwrU@ek*6O
zh)UM7$A^dRE~z;Yu%&X`S@mNaDQ_nyy6z^mDj9*2$^##rDNPp)TNii#^qBo#%i(7H
zr5MTM>stSmw2Ta)mun;rKz`DM)XfiaFlj`mr$puCNFR`IE-Vn+q<*#7_|V$QM))8~
zhm2UZ;6Tge?v5ukq@kf<RlkwZ$(ddtWDXfkRQuBcy{D&ox_8+={H#AZ;Ve+7gJ~v{
z9br<o9T9F~+1PT8mxNCqu#KGdwDPU(+ZL$0+n%58Xz}J~2{DVlSHWy!3|c-W!`!Xk
zEz*M|n~YrdseJA65#~>s!K2(a41zXYF?R?3rHZC~PeiCfTue<(hpv5yGC2LOJ`NYG
zsVu18t!gtJf>l?O@9!t<%p1N=^!dX;1Qj{YwI#@1x14Xfb;s82T#pnb3V?2#=7L)H
z%-R!na$8l^#g4Go8ba$Ohle94bJMhHC53s8n)j#8gy=gst;c#ze)!y_bUWTAKSs~d
z;Q}7$X4Zr&qY(@$lxYJ05(G8nV-SszVPs`pmT9rp>gbL?(RpG#xV4$w^@fwHJVp3S
zhLp6(g!e1pcePkLsbp9pf)U;q4b(2%cggDLr1y3==u!}cKpw5Fo3P{tZ?H+g=n#IV
z`Xx9Wk7?QLyBqgJosUK>XBNL0$ziF2?)$jKm9@3~?|FTF2L!4Ni(m#kieL#)<D#OV
zVV>jHPeK-Qz^yQ<Kf}DzoQ_+}`%_>TXee5*fE1J!^)Lq6Z0DyW#B_oN@!Cf@_j7FR
zL?zu=W9Y9;PYKG9Bt8qqcWwqT_6;kSTifsHRaunL9>Ju2;}aQiHO19tP1X)MIE>+6
zL&~5lk?#9C5fP(@AI$oE-nqpf{^ZSD$aS*B7V-6h(knqMFfY_N-xe2-Qc|uFvFgc0
zM*3!|TW+fk537@t6{DgyfA6YyoT}P;Cu+dS?tQKKweuCy|FI8EMEmk=q@>R4{(~n{
zG{u=OtmL}#=jISYX@yZkhJ6PMT025<Xx$w;NKPMq_P;Cd*o<(e>=y9h`s#@+?`=Qu
zF#0ZI*j&MNfrl?J=4V+_w<z$yj!<W<kuDgTco|>Y@iK*DoX7hrahsYyV4cxm^0lL&
z(ixwa%b<$H)A-!-b-0#rO6rVId?&X4MsBF>olmiE{ttlR(I12!+`h#PB^i7$IL?<D
zez0uJkR49#uPD_~`4D?~m-D$We-Hx^;r-Yr@17z6i>;H-iuyGgB{Vwk<413)C?@iB
zo9u*Miy$7ICn6Q9@?lmL96z+**<LZJa5iU`cyz!H^{Bi{rIXIyW>B7qN9XAB_S1=}
z6bxmHx-iVCzWR;fU$A`_Gf%B9p<yr=_w}o8aY;rk8kO~Wgv@zO>9!xP4)o5ATmei}
zZAc{%SN@P$QsnRlli2CpJb}M|vP|N~j~<>W3hqD!64+XF)A=lpw>6LB<T-Qz>LRco
z8FC?HYmqA5eiH+`^I!OER%UF)ev>ghLWZ*^bliZ9V9x?Ilz8LTJ8FixF9;ph>@-Dp
z41ySxcBSaPwZ8s_+troe^kE;ij~-U5dJOG&41;Y?ib>zszpdIF^9hINvY;Oh``U3W
zxOgtJx?)(rJId)g84zEty}P=$_Usw!+Dw??_bMCdlr&%L=S$!qET(S|n7kn<_>gE;
z!c$e@6wNB}s27n);_oI)Xf(k9TYj*g@O;Pvv7@qUNg(a%Yba%}IP88J0u1?YHw<(U
zq+q3=Jkhk;{E#MOXlZ3t5!H%spEEjcpUrT;Z|YN4E|5yCar2`idq?e1vTSKR3hcJ`
zc;c@J+3dl4x&nHPbaAS>!(<2zn<|Zc@jRYskCyW5kW{!_!neM>f`Z+7!`N7^iu_et
z&k8+om^#eZ{w$1)%uJ2*&F#?`apC3vPiQ7r`Uf<_*8dMQ+kR;DKTFYn2OZast7KSV
zWb>ECV1fp5mo923JXaDoaXs7#B~0=BM4eV`Do@I`5FO1S^P|;5t|Th-s;bc89Ujv+
z1TooAsA-wxNrP@$VOd#{*7t=D#)d|*#DY(s-^SAO3Os{U#VX&VS`DfbL8?_VhLHow
zP4e{`T$$M$8PWj&Tzq_Wv}_^|H(8RiqbZcaO&vZm;RiT6>+kF}wz>~yv_^&qq8Jyh
zj6RE>1v%e9(%&p06mw;Z!z(Wp3!pEdvT$u?SqwQ1NVw4K*3^iQP=7(5K4H{ONctO+
zP6~fZ_S4)Y2^i<$_W1?dq*lV`2ZD7kNu=R<t!6P;1Z+chaK!!w;_%z5_a+|d!@!Vh
zKE(X8@HFK7B^_92)G$-f7nMv8qaXMDKsfQbCG=#frM$s&=+r$!JAHc^gDxkRP8jG+
zN3G&TYp?&3`q^z&apG~BP`!jTV*HlfASg<MI1EWYrp%)$4Qg=kpTMN;cBuO|A!>M$
zy~E%~e<2a;<-w<JnI68TaU*ZKqOP*9d&=!DcKtm~<`l$S*dz!BOf?=E&{55%AgK3$
z|DdkzA#ZG4?eCBDFy<D+926Rwkbv#&jZJ@H5Xc8s;U(13wA%C_4Z%wshWZwlU@uM6
zTXb`~%y*dzuZi_(F!U&1RSjPowqHZtkobfEp&^{Hqo-YI{|0*DWqLgRLP8swSc_lk
zy=WZB`sXG|%PGDmWLx^Un+B4G1sQ(HpQaI87dpx9h23{kcc1$)85EPbV_~kzU##+N
z&u2TLA-pG%4uP8D<==dZP)wl*d7nRg&(OC*)Vg7xkf5A3GBmNe?DIjTp$JWlziZvm
zPAoOGz@@8_=P2Km)wA7P9x(<LRIzEoy-0V_i5VY{{n}2RGTr?hM`O$WuJ32&SX7s`
z-fMZ~R|?ej8Jvz?Ck2m?bmI;x-@0S_9KtfrQO~~npI@{rNs#PkRqy$oP7km3!8ee6
zb6YsAyW59^?4zUZ;N?7naT;tGykS^~R2}_BuDiwg2cRX<3{8{^*bRz8*_ADW`Z7Aj
z)cE_fxtq?Sca((XKv(!Yo*T^ZTsF>=rAP?X!cUd1j=NUdT71@!i5d<^ThfJ;_dkg4
z8o2I3EMui;LZQ_?v-4GH&+E5%h8o}?U7r?9dFa2AEnrPNLzno(JP&?kIB1xKWQna$
zc$t$%tx--CT=BF3xtR}Uz1Ta%_a*H|jvkCttIV4N<IXx~@!V2`%odg*uL9+IplYvp
zqAE}EWMp{*u_)1c%%N&8ka~#|3D_poOvCwL?@HyoW&WP;#nzD+QCdT(SxY>qsg(*@
zSC1#$E61u$*b^yC?O16!jOvh}kEMn;ZhEOzqA^hu1v`>GJ$L)I9|TdSpzEWB!M}*{
zNRNJjMRgy0D;QhU<ISc7;EWCr-#`r%6!n(t%aoWSd_-(g+7kSZa@Sxs@<|d9PY|y|
z>%A3r2~Lv#2y!^42-u!9)S4YYO|E(0E=t1^w&6#CvD8enpW1y}Ecl(yQz3>72D?3Z
zB2M(|)+5se3O+{#VhKHqSj!pAb+${{0@)N&^7z*2)ANUqQ~1MO>#{xMf~il}mm2M^
z%AAUn#9`;5LbD}=H}G<EYhKMQk}?I)+iB`#Ud;7k2aG<{y*ORM_VOE7WwV^F;i=Hd
zK+}>KA}lW(qinV@CSWsl?d=T)vswf!)#5DSMG`wRXHOpvxdJQ(M%?|((LwBF?vU?e
z=Q_}#ryZE|Owaw=6-#NNZz4cTE5^*qs&Gt;hVy`cZK-}&N)XY7G%Ymo`E&kCE4lr&
z<$kicS?BdPTaPmE-ja6++0Kgnf7P93Sd?4%?h!<!l@JsJk?xRgk&y0`4iN#71`!ZZ
zVkqebX(gqkhENz%L_$THp_>7u<E#O<wb}o3u5+C)=OZIB@4U0#weIJ>f6rQso^NoN
zKi6b?SJW-C<KpML9ksmIb*Zw(b;S<4+4@w>fSg9E_VF?fXMo^5j?O+|hd<(ZcQ}o`
z-QIEto0i5?xFSk*^;d%cK`&@+k)1KnI_}<!=(jy_RR=5TbqwoneRH4h>o~7^7%mB0
zPy%D-Ehi+lV7E1VW^sP1!3Q6JL$Z$^-S_fh({6-7{8Jy7aDR&>SHMw&77yFp8Kf_@
zEV^#p){!XT5X)-$eO?Mk?+DNFn~m3C9d55YfX!kcT?$qP{jHSXB)h|5%DU(j0?U<`
z5@4?I;B&vk?B}6tVb#7edidIItVi2BX3u77v~^CoMg>}LCA40@d8DOKDTN=FS@h+D
zrV13ZK^w+gx&D#oXuqqzG!+c4(Qd?dC+mq8BkJm$MJmYV<fKAolYG)`kEl|?;|X7?
z<&(7Q)(j)#Xz&$h(|c%RgtDqo?Q}5w218z#?sWTca1h`#GMbs+x=OzPG(?@0_391&
z?JUztRrv|jIj2L2Qksaeu&TE9SE#kN)PrRj*qgR7?}G*Jw9FTB+4H=P)3<7C(R4aL
z(+@Ekj-Pverd9en-Wa|7ASmWrlloE2DQ7hjcx-O$6Q`DVY+2`NX+{Qs4I-6l#*16L
zW|ECYoWI)YPiKTZu)cr)TSd_ok=hZ{BzUdESEggHq$S|HMbvqJ=I&aToosZX6{FBe
zHvL`JNzA;-(c3ZkN=>ibF89>>A6*!4E6Y!F&z$1(m)XD7=8iXh?My}jy6P<rGc$Rs
zxi58fXAT*vQ=%?N`LpV(Q9Z@VdayOT<wnyJ9UU!uw5|(wWZ0=tN=;EWIDo3X#PMfn
zpHx$CAs5;8fR03>5ahen%C+jp?SfL(+{-3@<0!A_L5{H1NqrmSceJ?Zq2pN6eTkUZ
z5XyhM+_z3nS=n=?*5G<MMDL0=ZHet$M>UqxyTb~^&y>>|Kc;w6Hxmoe1~m}sGZ?z~
z!p&@#&P7D+&$=$<>Qt>64#;BkeddY8w76@?=e`!k#iiC}^DHA$^Il(1|6KSfHB-mk
zwPL>EGL5<JtiDvGw0bg+3yhy&*I)~2;F)#JBoxP22M?SQnDe|szP!3ik!3^qS)u+~
z{cc`>;T{6V;zi5H{c!RmQq+Phw<(9iv}Mie!}P&yky29=VX>B1TtnBwx8w7``0|xA
z93fivgyUi0$vXth&&}oFu#1Vs%CM<^iH@xZCa5!OF??AH31Uu2(Bn5S6XTCMFDggo
zM2ar9=RDv~6$q+!GfWmtb*S~$V%Nc@vmvc{bv9cbk?xIEUJDKLGBC5Wf~~om>Ty!v
zYl`320%gdPwtJ~wj|@CKX40h`Szm{Jozpb*lft%eC~i4}`V|x<6pkOcB0R>d=r5z#
zsbeWI#cqD;&7t>#HwArp!%l1VDpzl~+y24$d*{j7ehg5?QSgX}l&7WLHy#U0N+Qcq
z+zXwZphZ*PE!t-Ga4o!-5BAX_X9!k-rdSV;A3H@H@(3=(g5j6yb|qb%@Q8uKW;<_>
z>d3FpX|HNK4Qg1dmKq>>bYVPuVEKG1TuvBRmyPe$)zKn-^%-+}nqGb06e+v6yiCt`
z2Jdh-kI_BbqXG9r@UYj-lX-uymF4UIT3$Xe^+9v<phRsd-Q3fI%i$0sF*PUADsM-o
zid^e`i9(1BVy(rt!)tZRaV^U7avwV6ji4$H_(D8bn_dZAr=D0a*-T#vlF+#Mt}h}&
zG0H4iU|rDv0v^Y8;t5l!fC!<=q}>t*M*W&il0?4P2;P@r`Qocy2}LxXJGd3G1(x$A
z=q0$=HE^8Z%xT2Ll2h>Qz*!<)Z-;i+0lrdJpdq@deSRgz*S0D8y|1`*)e!AX<D^ja
z!%lny?x+`Pv~$dYN5oxp^jk_<#Rm+@8k{3`PF(aj7gl{UYh>(lD!)RH2jK(}yJnJi
zT&yw{M+%?d4z6K}XrLy;Qc{XDT99Zl3m)dMt~y<h2@RKfr(RW_nkrOvij7$kLGMOh
z*n{6br7~V`yP~kOaHN3It3*-A93)h1{OReOz=oIs@m1v>*CwUui$r!B_8r~|@xt*I
zOb1Ixcgnfu=Oy9vFT5FRa4z|`Rgi?ir?6G-dc9T7CZ>_D0uxO^2e7}7t5ZqqSNzf&
zw>Q4yGE4~XkjlZD!|Jos;kt68Q^@MHDK)7x4sne(YPNDUEHlUH0sEcCKA1Rvi8yCT
zS-Er3ti46~>y08NY38VNwnEXDG((y)+yoe#$}~^PQ**H}5v|^VM|vLzbUP#Pi3F-G
z)@@22Iy156R$f&_FcmO;d6AdNmF8AUJGb0D$hge2f^;0gQj(YDY?1T`LpP)C>I;dC
zHpd%`kgc~6K|yiRpbkpamx14sC*SXI?C2Ykwif!xEDn={<0s|xoE+EN#mg*#QnU+E
zG$~f1jQ$o!oW-Pv-PFmjgi$urAVWDghz}?K{Gq;t0Yb)FHZs8zq?+ee+fyA}K_%zq
zt$?Wt>@6&8yVr$YBNT4=X?zrOgC!+<Qjirv$+U_0b`dE(Rkb#@u#Q3V>L8NQ;>SC~
zp!Exo$Po^V(jC<C&*33@2|}<A2a;rbuL&iD-Ys_H$Xj>nz2aB>L18_aaN9%=8B=UC
z|BNa4?x%7XbXD3ra?*@AMl4hOstUjpSy;|SsL3f9(l`}uF?CP^tQIw?rebN&Im#f5
z=eHH~YC{fJ@|j~hRHp^#jBZl+W+ox!4_9WaOV5#ZrrQpd&U3xkF0Jo*jRkh8`*Hrd
zr5gt8zNk_<1JO+EA3Wyr#Dcsg+?oSgUc0@1HQ*|?zclA)8n?YgH*8#=+rWWETm(*!
zLNH0%JFs&JDCM}3hbKW8{TV6Q>)%vL^eqn;wXF!63dWp$u@Uug$-+9W+NPmho5gi~
z-I19&)^QoeZm!3h(fP5Co?x!};rInGNV4<X=D=x0My<nVUxNDktHyuU-@Tep;p+*!
zlXVtvIXUAmtwMf(vcgpB1iOjv8_?y#ofE9vuBhc^PBb|wNgyJa$~p|J-FE4FtmOpq
z*`Aj<C7h9uXV(p8G~^JOsh?5`_O8wkCuS!t2PMve{}+q@IdPf(J8_W*iHoN(1#Lh`
z2rJR}1E{^Q=TLQdU6CD%9q+DOkGfz*?9$z7Ca(A8)<JEVpwn5<<o$OVe7B@TVLDw3
zA+T#`wYRt8AJGep72i8cArBZZ7UQ^DH-}VNTy~4OY2OTe)R%g_Gu;Uo{I72-T;G;^
zs@<vaNv&JNkB-mNabIgHg);+x0CESB!cnEQM<Aom<+;~NZL~0`EVe{v5Tfh8f9?0@
z%B+vcP`L~YC`5A{g--lLU|43ads)$JFmcoJxv!ewZ0cSN+sx{ni6wQqM;wmCq@BS8
zu)R+=iik#eJMjxTK-jTQkl}DCOMhyrE5HO034T0aICNkJ<cHQ76r0|_-^?g|Yr{Ap
z>K_~|V2USg&A$`5*Gsy*2G<nKPaF3Za@p6wG;6qX@)Ysq5r%UZ_ohD!RH&}ma|CU0
z*8J{p7(@rrqa^yAevBCM-Y`K&T2nT+>AX(OJN4QN%r56SW<Syo6(|9;fh^ET_ttX2
zv3F4A9lhjFqF(HWV+QoQY0~3Atb;RM?Tu2pyc1^)!=1KfmwwhASj2Yi-n)5OeEXf$
z7Dor|<!lpy@Zq<M7@GB>G-2b!{H*u$F4V)z{iMIwdF3BuI1UTPUs}3sZ$QCd=zEzK
z^3@JR0R1Zc0m-+eM4p&x_DPLTuxPPxraqx$oum=Ta`gDdrYPdoOCW;D87y$vBb_hP
zU<c%nUy6bEQ5Gq)Jf<XuHcn$eeVN;1JMFLDO`Bz7a=!wd3^_!}m)gB**@=4lXn*&*
z<yUtt!u8xWsit6`U$tO+^}1vOQ{b-BA+NkED~x#^JmXUy_&v>SM)`r=SV&7v1&SgG
z`xM3Yn}30y-Hb{4Ge|LuVldQQSknfzdJa%Kdlp$tbiy%~TrPTjG0|^cytZdD+~wc7
zGa>rQkd>0qnbQCrd~zQ+`ZcE0@}=9=@ZlrM<eB<v8f$~*M&VJ}p_1SzdgV)bgl!qg
zU?RR7Ly*pwi!<G5zH}uAZjsE=wa@b9pxBkV0|ZPsMC>T4KSL=-7AQKQ$2q^BW=8Rt
zrSY$OBdZNnGICFxDXpvp*2{__)JKxqT1oAFQ0o24jnAp>-@bx6X)^Y1blOLktoESU
zh-Y3k>Cg7;+{1MGv{)#at;hI@*~3gON$EB^icW}(Tq9(HVLxGO7T}8$M7hREIzu6H
zM)Xo8^(thG&(_kVoP2`VGTOpOM*k*nuG&=i2dBpq)wC>+Wc2%Z54YSBy|?^5S1Q8y
zr{@x6UyVAiyGG}gpq+dHO%41r=hQJ!XwFRxWzaXYpt_zlMEIQn_YqT8v4MjI<a3Ri
z#`EH_XM^vbJJYDLHQQa=*n%n|!Idk3bCSDBph{>cD<7qZ*j*x0fN?FqYxop}2P$cF
zP)VDf5=!2C{;Ih-W_PzDD%q?hyD24i=3q;C*lQ1~*c?jvDYYM@%C|EY=JR^#6st`R
zb}bP32AD%8_v9>Jz_aoTt9Cv?-EU+hAbij$7yG+g7i43i_TN-ia(w$rrt?7!I++Rr
z;gH<3px6@o-Zw%>uT<)vpQ{&AF75gL`09VSOy^EM;zN@3P)fU%1+qA?Bdrli4}nYd
zw}=crN6-?_&Ar~)sd{Xys~ZnOg1CN4E=kgap39kT6?Wdm(8n#|)Lq{`V|z<oN=ZIg
zDz!YiZS}3r^YevTQ4<iBjo+1Bo}OYg3+v>KjO9b8=%GI~Z02gP!fi`QISdUgvV@R{
z9bV~XxsR_}Mo({<KGY@-N=62GdHEVhXj;hfa+kVVoq+9t>!P;A=v|9pMNIPNvTQF3
z(?F(m#o_>&Iwv2Oj~7yHZBsKdgTry1AFg8aDw&iyeHa^yg@l8E2RpNOhgtCXrZ7U%
zzyNG~Xh2+56-m@BkNJ)A@Zn|~y?g;UQ2rUYD@42UrU^<(cmUdV8<sch+_gRNoSuA>
z#POo-sNou;JLS1xH)tvO*b%F(LTr*k4^UV=J_CahD=M<)DAkgG&Unh)bjr<M<&acm
zGX$Atk=+w9Ee&CE9`+etjsMQiG_=Kvh!}2phnJnPu)BIS^>DEMKxpY5nW&f9J-M0p
zeS8n{Y1hp<GLJx`kI<Ctjl07AK+T=^1I6DX4~$11_@#dqfW%yluel55%m-~|GIj2D
zR>w1)PnGab7TNRbohoj<Nm=9#$d1Kxuoe)=Za8gsuMB7wJ51HaOqTx{n6=?^-?tj-
zQqGnbji{%kwz6`M;c~Xup;@~dOfJ%V9`)fA={$u<^|8eMBR;@mO1F=4wEwGALp&6c
z+YY_z;qwvx5ku)~KRzDq)3Ba%VeO?x^b5Fnb{Z^U-w5MrSJ6&_B26q>HBehdai5dL
z8=;G&J}ntr^ZhFsYj%s?TB->9#HoL~#CxCt#4~eMEIO-%$H~~_!p~%^icT&YnT&B3
z#(6S|wEUl?VsHABD;95bCAnWMTHEzGE!WWufH!x$j_gbsxE@$U&x7@MqkeWT{eq?n
zM|q}sWRi`2HTy1irOw6lNj$b3ALT_h0i#K-DgM|+BfdEB*v%zW+xXZtN*Fqv;|1C3
zeoQPWs!Yx&B?QHl#OqsP{L#&t;)VQL<h2j=OL(FhMfPk2GJ}-=6_EAZ>)t7H;}xiK
zWeQo60Rfp~K#jgviTAMaX%p4YGGvW4^r8rXrzcuFzp|(w*B%JCU%zsTkq8<xpbPRR
zrIL$(j?T`vi_EWMUI+O+H1{zzx3XR^4BvW4N0a!<;)WiTzSSTSM&V7d&!O@0T2D07
zAfnSk;2Xf0L+SV27mE|VeCc)g^6`G;<CjDAxo*7QJO!dJeTKm%w%pD$WEZRvpO!cd
zUZCZB&O`-4l{Fb>1&RG2S{TnpGqvX(EtmS+hzT535Jfh<q&r8|Zii$&7z{_~dJUl4
zp;H|zF*CRcA|BBqQ)<C6GF;d&vU2XPkXZX%k_JLx8(Ghy+c?}?tEK}*!N!~0<;O*V
z@Xa4Z!7@?NiyG70s9->zLV|z@s;#zllA)H2?*UsFb%dGUkD!cMGCJl#mV(mq4U_UG
z@wbsvmQqKb1)5f!X_b7S>M^<`nfk=^C%HtPZkvOQ>^x~_424535fvf7P%rXL&KE4M
zZKQg4w!jQqnu$P#SVp!u3Pw*$ARM;1CLf^wT8Dq-CI1&PdaXKxR#B_i-O=jqbh%8}
zVG2Bw>kE-v5crmB;`3T#z7(?3MT)wu&OXEZTBPsJK6fF8%~YX8*o93*#QEqjAt@=4
zhljm;?uQEgLN6ut+`?ju8Qbm!B>RZceR3gQLv(l6%zct#mE09<GI9Uo@~hg~(t@Q(
zF#qgtSa`qspbsmZV8{qfmJ4Ef0P}c1sTWNfx?L1$5s^qNM8cX7q)m9xaLPmrR<&(f
z=_wYus`if5i;bY&3fG#SSUraDg6>t%8dKO^ibWYq+iz;k>(`kkg7|lWcF!g3yB2ih
zB%^p4gQ@j_b9#04T|+A0H43gdu=GqZ=I4)31`3@y7Qa%QQXW=d0|+Su<l-9_U+w(l
z$>nNi{DuL~_y5k#9!wiYcWgHHHtOB-E4CfH(w#<B&JD}Js`QRyQSlldikKFNd^az7
zDhBrhUvhaC@CZdJp&!j(n#HAw;0ZmL5UO~;1FXQxN*flr5diYDY*KHlV%$TZByJXO
zhhW1xK9w%4)DtPw=gy8nOh^&2yy=l=z06LLBik3##owF=J7%+)Cv%lrP=s$)3##8K
z^VknC$@fP{cG{Ybr;0Lbn&c=i&KncXne%2frn!0Ee3W_hmsI>G_Yy})Af9gf{G^^x
zVAw+@Nshj+(NG9{?U%a=$3P4PLOhfcCh(=|rCHN^^yf-APvw`M0VxWUgowuo3qu^_
zwS})wH-sca=$aP+NABgbILA*a9wqih!>j3k@@ze-+~VRa!0B1c^cOQ08|g>I25;@}
zu0S0_JZC@z+}^&{aO+=y6Yi`y{;x%CnP>&rQ($UbpBZi6GgDSt$b)yb#@*6RUmPmW
zd-^pQ13936mu|^1##w^j*E&{G`0aOr%4-f)WMr<P+)1U3%W`WIv%G6a_L*Rj)fezH
z!aL?xCF}xEPF7RymuSBsl9d2CV!p4EJ<@F-$_soUX_|$9h@*x2UW%#wli|_LS=K##
z;4atBC!>#-68fRzPE2_Cqs4ygVv<;lW7gy*#C30zq@%-?TwqOK<GVyW9EiWNcZy0H
zgmXupd$~bD*pVjz0@)qR)sR*qVq#M@d$E&A=AJkPGP3+3CRbGuYADRC!%V2sQn1D$
z|Ci4gK7N(r9sRP|_9q`CLG^HVBWZqqCK@$4j&tH2nEZim5>DqmcYH}g>NDp^1}?xO
zS$JtJf`_<)|Bs%SbT2#2>>NhDMuKyCOf!1V+&*!#ghz^7dLhWSxeA7cWFfOlb6~gG
z9htK3f$G>IA0-x+$5Hc2G;7)FsT>de0Sk5I@Y(Ni!0fx?Gpz+)N?E#%yp2W(#v0f9
zxA!%_gR{9}*lK$pxalwonw)m;k$5WJ)RbacCgeR@juCmx<OSHyBM`~V7|N+IvK8LD
ze}B9w`M_9J)x`NblJN<T3AIliVx3m+Wq$!4HAj<qGQKZb+G1U{oIjj<Su35kpY+q9
zfbJrIpN3WvQc`wDJZdEG6PE+sMof(F*n!6wqg(o5noa*Ho{+~T?oLGIT}F}WA;`13
z^IZU1tQuE-oL?|3t_-29brW|IH!!#nuX1<Vc@n?v44y!BJ@--@J|J`#7g{xD@2o-0
zX;@jifSP8-E3NQ+1FCCVMVXeIjc{R(D>9Ey>V=FmSAPc`!SW_BB($=x21P`KbNU~W
z$UXVT<fXy?nB<;JrYPV{c$tEPnW!jfS%?SihJ*m~8p^iMKUfpN>%<3oa#wt5Fzh1j
zDTFp99M`FOQm?zijr^2Nte9;2fkZ_`Qfz;C?1xN6LrgR-5eJcD@Ax&lx;3E~c#q#0
zzf<16xv{a)m9%qVi3V-VR_-06PqxU$##~@3KO&uH@mnitZmt%f^6peVylSzt|2SFB
zEDYS$fln$t9IH}FA48&|f{wYS&<B*CgH2X{%){MJ@P6Yl$AEkjAz>pYnDB1mye^hO
znz8W+!6_3e%;&YwLrmPI_BaCOD~!Yb7H(0yv@t@xdg(-|4(j*&>1OCPUM(<BZ3(OB
z<xGmv<;|Fi3hL~6&8<{0nSGsk+5@UYe5O(BKwKNqZtId18g!Q*uuP&aGK(JiwxdNy
zdVs{iD@rN1e}zv?71WFSi<Yz-GIMfT`Tau!kFGS1{a671`kL;*m~&Jh@Xn4zHNYM?
ze_waGYK<Fmlxa7`XgQ$`O_IAltW5foX;KXg&w_u}wbtom(#jz%xKLQOq@kx_FO&|C
zPK?P<BN=H9Nep5d697Itx$@?EzZp*^(^P))XO<no6RZha``GC!xa?WEMfzXY1cr_;
zuhcqI&e%{cQ;uo7Nqb8NJIlxg+O165y%`*Ia5-#5>~=F#in{+6X`Oc%MqXvjF^#<P
ze9vOcajwhpM5iqRPU=gM5naOVR@w+c>d*HLMALN;HftG9OiS&=MMZ=oe&i=I?VQu^
zpmUJ~5&qfq5I?`%zCKPN*E$KNz7cAw=4m7-bdTmQP$)C-N_v#!=oK&DZqax6tN16>
zd7@3!X+V=Q$ezdFQw_K<DT41bxyzLPE1p6L9NfHE%bMb_OLtH3^H`A-QudeIYVN{0
z>PQ@(<!#1<dX=WCLNdB9;?zq9Y<9*|hf7nMNPkaSD@CjC$^&jqI&C7sn^2ABV?Bem
z#KkooaG5cGybxd4fMd<tuI-Su5s4H6$;P0pSM!#i$N9qvaGnwz>2zM6gcIp|%J4$}
zZF}Bv1{0X>w}Z{-yE=s`svt~%8Rn0lI`%qj`>NR()iT(_tMBppbkBzm)`NpWZS~!b
z<2mdVC`kSs#SF$L2oi@O$kwxk1wKZ3a{JP2_-rr3z#&@2p5mgbSf_m_u2f9CME<5l
zMkL=+3{BSLAn1YbPH6H>R`=&Pa&fn906xp6Y+_+BQs4d$O3tD8OO0*PQTREneJS{^
zPjn#FzWND1fw8PbpPQc_B3YfE*4>&E#7sEz@CHVn{#!~z1=tm?VB5hj=aoszxwCLp
z8x*O`@z=0tZ~|vij~?^!vFP!3oTWIajQ*Ms{Nd>DkhpUBYgd;PW%(K=3ioS|2u!(0
zn~Vkm6ooa24#MVK4jKW3I{WSWmJwv8I{>ec)bFCV74^V}0NkK4tP^DU&-L>O&{aU&
z(KhrIOR6+HEO48k-z+O`;Qt?V)<070C-U^8g90#2uB0<V#-M+&$vDYl<LtK17#_n2
z$`6$9UKDtzfj}P{96<d9JBFp8P+|`OoE_x^XQu+Zc}|XZR|+3CR<@~6jj?eVTxtR@
zAoVX7j3Zaz2Ymh)>1}4_99)`SNHIqR+BH%i-CqeL%qgl9wC%|x(<+3LM27D|(ipFn
zaK6MQV*dm?R`7pKV99Wi-wBD=j~;76p|g*rrRS-`9@j38j5pG!7qZ|BpYYrAx(1Hx
zY3}QH1zB%XSF8nJO;poz5nB@z15s&&_s0)V-dx%j@1kNJurlsYvERqqKXLN@eovf-
z2}V&|-$O$k%Q(T*eqFc1pJ4#deyWDJVVew=fu9+zGHUkLqoVAjafMHoaLPbv8+{LY
zFc}}o`V{h0CI2O;dVu;_{FWyf+54^3J6N)sO`reS6s<_hH5kxSZ(25$=}~@I255r6
z(fm*SA+e}BR&!cd2K&Y7@NxQ7mU0^^2;QBNObPS#sJc<t(TS)M%OAk?v7jO+D3kNy
zQh^4x9GhW|$n6;rvX>GgxTGH>Tm~>jqM76k++&)eC!esKo9%_tOwlaSz<Lv*GicYS
zanYQdRw@Y^IkAsMJnUlcp}`3v4$v?z07r~_9~13Gh$MM~a?IPkJ@#;cpV~<|>0cmu
ztw{I~qpkQ)V)|cmZ%Y~!%1A`UQ21~chQy_bB<Pe6Re^-^ztEkD)Td5b_V!Zh>ewx0
zo4%lM{QMa>LRY2r2E3B>QD09T?TBgjw1B77)q25-yZ!jSW2?`c4Hz_?J<>KBR&wtN
zzKkCc0vf5fW)$+pK+RE_90~iyLPR@T^?@A)H8q4kunHZUVeSBB7k&PZsCpLwTkX!M
z&k-S{(+^9_6&(k~bz|zb4oG-X@WcRs!>sK!>?-*ai;y{dK|b~(V^6+!1h}AZAL*Yo
zIq@%=oKDTfZ<15Md^ZS5&YU*tk?Jamh35v>^rs^UOqBp8hB!U7lEBG#@i&SbgIIDN
zTyLB(eGZTAJs|CDz!z3&L-nQld@ux#c3fLSu1bKT{?i_t5O!I7K0SR4U@k$Ef!>!z
zpObk$!v%b_=9A#fxty=%6|QW0++QPh-Ij{h9HcU6f^AX`>G$Y&6dy5ecGxvst%hu!
zP)C?zc#pVmSc;l^iY)Hqj${)hX|OG&<tQufHwBUum~=@ut7w*2XG{#ef4;!bVF@Uk
zD;|A--3KeIj}N<p!vh*u`dzn-I``SF!7%ZY27|w5Riw;X>v#qy4Np}KR(({v9Y-yb
z0&L*~2*i?6v<-gq0q8=5vY_yH$;)28h4pO1x7HHJE#O31+OpX+=Bv(8TN?i*w)QBK
z<ji1sG@6_v6H^fJDr5EDOP3lSb2_wKTg3tfRK>Sn4Ec%f0*1;j-IV}Z+D@+m^4zAn
zognb6R@Hw?EDfkr+$^gJzm8z%E?Ob^v0xxU8*fF0wf21ZQu`jN+sTABLX<)LKKkvk
zyCI_iZ5P&TA4N9=+%pVuoXuxAT_{eVHKKQL>X`m!ZZ!`y*$x&==tl_k=i0H#B#C&b
z*bQxGZ+Q6XMI`GK$JE3W$)zH=CMtpaMMMJaD)J&o04BP=md#@LxMgk1wPXDp!lq5G
zJ&Tkzu|fr^Di1PUn~`<{0OUOrf?4C%J^?!<cYp>yNkYsjNe9zRy@HO>K22e=F!LDV
zqFQc-k?!CVm*CH-OB*ziJxTjbTe(-&%18uv<`s=fQb{J2o7|EGSV&=E7oh#%Z-{s#
zzm2{cf1;tGu|4LCJv%XIN0}BT_X%~x^qyRf#k@mg$Qp6S@c#Z#d;3(92d0^NFt5-a
zg&|)6xO3o&+Z9;9yotM5di@MOC<}!1;<@ajcz!r!B`t&#=tM?LVKYulxDZ$F69eS0
zVaFyZwRG<~t%~7c@ICV>E2)BUlKv>LU4zm~JMH}b4+fOS9dAjMpc(7p4K<xc14<W0
z`QkT#0B^qt%M2C9!1+S=9r^SjO{|nM7h8J(pZj8jDaAb$xlI4l77Mj;J8{L#W1Ol_
zdh=rZ>j9y4+|&mTU>D~m0Nsb~FIiTxkbkKMdSbmTEhSHmwi2%O#@;~z$#Ug82}Foe
zmQ-{RTx5KAS8<az^rai&54Yye@1f=bF&X4mapcM?jNr=bdyA%w1Kme*DH!`8p+ffP
zrE@+fpd%pr)u88!`Uakr{20=+ZOX35l99_`$cf5(zeH+u%^dJpkb$?hxYgJ88~$>+
zvQ`2T1$3rL4pvQ>VwS9=Srx5r5k0I^y2Mcb@FQ3QPRy6kKi2>;BKGsVK=BtU)}Hi6
zA!tO>>8YgG75B43`W?7UsV`Pa{7hrru8jQ?#$L2Nw;giIs6$06iQ_ai^l~PupEZn3
zOsR+n_10;<VgLIUyap;P6odbvTMqEYcoRuv3Rh3nPiGomfF3`*LEvQ>{I~yvc^=u#
z;jszTRpHPy^hS>Z*6N>{=+CU~6P7BV8bIR$qfkqzqLod4sC~?+{7n~^6z}=hwe(6!
zb-VLrvXKvv#2(8xI5W7r=NJv!=Mxg~!5P=Rb;8KTHhN_j2BDb`@jl?3&IZ869tzx8
zAGjFZ0AP{Y!(ET)=#I3@`~Tso{*e?q=o#J8H~~&e{InoBM~Mk~*TV}>LXP2>*;Fp>
zf8~WdviA1qm#a!jU%ldRNjN`*j0$@@Dfn04Jw+D-(5eGq<c|C8PVp&E3@=dZpVDJ@
zy5p6>Tiz3`0u=^rVQe96#?v4C?@cUz<BK@xJ%=pNU9&$<%aBPtR?~_C3W>K5G}!g?
zKkx40^z}7jlBW0%eFPc4$Y<8<TTqaroc!qW)r($ZN(d}I1{`r-0(%rO-GSwmddcag
zQxd}juJ2j$D`jL9x4f^j+g-MWZjBUz6A!$ZwMP<>-At!NW)yzNsJN%602qw_Qh^&r
zs`D<Ro?$ZA4M0Y_A(jfu`H<?purVh9q7Y_uEQV_#WPIA+x(>`fp&Wwz`0v2*a`bhs
z($bZ=rK__`A|IshXlQ_QX0Lt5+hr0FiOzifrt!Gnj_jX)JB^#$M1m~RB{1{|4nHRZ
zCRj>_Um^ujT4kk*yV@_SDU$w(ftot{gEW6Nun*^=NVAHAsMXW#+D0S7%gTfkqn#`G
zSqR*FMJZCCw3o@^*>*<2WSJuT&F6W>sct@4+U!-3NV9I=XYCw&e>8tu0ZCPrv}4jU
zl6Bef<+GLkVv<=FisH7pD|_$KBv(}jUN$mpGk%pFUjj{ZOlV1_!ROIafD@ZzJzpYc
zB2OK1GXiY9@T;wk=C`)GX*uGVKcrDcU3*6WH(oVN45Tq(m#1$|J78QyqMfgA00qR}
zOk8>z))^-HJJ!i~mBbUOWJgi*;jW$R7J3H)^FP9)xv2eSdC*(EYqa61dX4c|0KTqp
z6%W1we|Y`(jBrC+vZ2&3!HV&$8po}Gf8^hB8(l6B_ZGf|apq<4#GdrgnNz)EBuPwH
zx6((C(uTohODppP?%Pou3m^}UO{pmF^;;ykw3!;RyO59Gpv9`KR^7@*OwLkcaf_^q
zS%H+EsL1=YAqtwpMjV0Nfe;KqvWhB`D$VX|oJ2}LP{jxg2iN$dNP*`JA1Bw2g@=sy
z5iM3^86j1>Ht^QV6bzPc5>w0HxwD1k-gaLr92e4!H7+*@7C7PxY9>Qok0Ux3^x#4`
zaZBWZ@TQI;(E9?`mOS#JDwDG@ouzH>KOg6cesGIo)*}N3!=qEvFI-iJut<V*1_;rm
z5j8+mi>&J0vJ>v90C`kUC4pr?iJw|6Xcwz;vw!hcUe0T~nd~U%N4NE%6&f<~0+GsK
z5(=x4SB0?MCs5BVGRB&g2}kH6U8yQC$uwv#521*T|H%}>dGr?6_?(HmnyKS0VAkxw
z5GuLKB$VcMDvN;}=W4WE0x4&of(E<SaX)jzZpw&AuN~Yr^P@5!u*CgL4HHvS@90(~
zIkFdj3i>NH1S_Zen01MF-;<0yw`9MB(ji*2d$z+JrL;jm14F?2$x5yCcQdS{)t`qg
z5dP$je)W_;)uxim_vrql5}hD;cjluupr{wkR+|RzegB@wkVs-_+pnSqBJ8|{QoysM
z*I6}tch_-I5F4tM%!SJ=XY~TWuD$War>!aEVgwAM>(u%+Z{rja@~z)LThK0q#{P7m
zPABiI2h!8u1HyBOx%177W-?GUf!#}^66{9w6F?uhhX*}vXB69dZ|p%D7jj!Gx^tRq
z+Q&l&E@I+P0Ho347W#jDvYHDpcFutuL6(2o>sSVf0_3!0MhkU{<1zNSb?>~5LZDK#
zlM@{8aDW37PU(*kTQd(<L9GMA6l8?gxmZC&NI>^{L>OCSj1<nh7PzcD8k*&L=JF@|
z9KeZ@`y6zYV>-}sR)U_qk`i8mxe`BVYDcz0g`~z#@_dJHzjfC&x5==w&S#c<d3eO3
z`*Qz%Rj((4=u_LF4_d$}vD_G&Bux796{49lM0uv{?j$w+7nLy#kyHN9E<5EL^|P`8
zCLtgQG!;Sh`q^g(i)0S^;Y_`W&-~xygJy5;ow!oXG=dG1r@2zk7yvNEcoj&o|DvC+
zNuDtP&pm<hJP79j%KL61+y`klRaT;Qq@F!(QuIMKya6xkXS3TAxRmFa-?~1>eK088
zcBWz@J&7AQV%Q9Jd*s<^_n0=sC-49)_Uy)%FX##byDOl8-4%QjP{dyx|HGNM{%glu
zB%sUYv$w8`oCUiis1PFep9e=NQT#uA!L;w}$zK&;eFFcC4&LUa8#oFI`8LEVP|$67
z%+?*hkcJA;2Ae`48NTD6uW5)Q=21^bwc{6_VKEqj%@%M$Z|?EWPgKsp@~}Zb3jDs~
z7cMZpbaVEB^N=zeZ>5idB7qJ%_rM27gJB>4g7~ZBi#Yl3;3y@If8Glm<fQP=g`uFF
qCH&_PprE`v`{UjIfBE&uBYY<%)|9qZ<_hFzmX%VHER-;Q`o93~vJi^^

literal 0
HcmV?d00001

diff --git a/example/ck_tile/15_fused_moe/misc/moe-2.png b/example/ck_tile/15_fused_moe/misc/moe-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..98d83866fad9925583db583e5179f139202cf612
GIT binary patch
literal 126766
zcmd42Wmr^g+cu1#pdg?KNC=34bT>n&C@Edi(%mr(F^C8#-RTh04bnLX(k0DMLpKcF
z1Ml*_ysziFpXYtPAK#B}`}o7Q);4Rc^E}oO`?2rG`l75TjgL!#i-CcGFDvs}6$9hW
zF$M<a!Cg$?%oTP10tUv-jqK}}>K;?uushzwoz$HUgA66FBk3(aHZ<7m&l})oD^g80
zSC8rzv8yr>gMq)G7aw2W-CuSVp7nCkuC+%+FFlq1o^a_SQMxg=+qd1<bG&+Kel1BJ
zCPfWA<j<d@4$~RczYj1la<QQ}e;>8f`5|xpedK6A5B~cA<16JqFI)S6eOb^JJx4J6
z2Fu@fzVkqor7^1zBNY?g=JsDtbbNG1wTtopms7T9RE;u!mK@}N6G1*nSf4#}3mUZ5
zg8Fkh7+COoiS>AAoHMEtEI8I*r<oMz!5`4+x;<3$;97z!g1^qD3ju~c@waCNe?TsX
z?1BC|`(g1m6zA_76OW<DjxHFTL}$z}Y9of2_Bf@#m@()R{{C$(a7_yoxFgmT!36U+
zBzM7g=jwq)@*~k;mW><?3|r%0tRmr$ZpU(Nc>ex<%h>PdVZqyeuR<{jtyt~6mIgdl
z=oWTU<1CY(Z(VfBzuZ8N>h~u0K>yy_Jj-AE`#=f&zw|>c5bS~UI)6Zea>6PzvZim#
zQO3Tw>$i^a*L!Qh|Gn83l;dv`EWLo6@R5D`5%<%mO1@qpy)vV@aU!p0QROPS<a5j)
z1KDH1v9@ubK_K8FD(MTVCPlrf{7S{nX!}P@cFViT(iRMdXCME!3ve!BEMeJhpaj>H
z7gluO2|bamdsm7of1FtG!+-4=u>8EZ`#BYzU0vs`!@5w>5kayz8gvlidhni{I7?){
zoRpUh=3g)EKh|_c)#8(i0y6~@$$8wUJc(7wWqxtd_`v%X198%~z^y7#@id}f_HatK
zsc)&@K9|{?>HXUjPgcODM)q^XSba1u4(1jxId-mW3loVq{Dg;{jb1&WbV|W>V~a2H
zzXb<%89s}|W6sz}C{%Ia%R8zkRyYZ8^;b3QJ9Z-d^OAu*&8!JSg9Pqkwwfwj9^8Jl
ziAbc%jMAjK3Q|;B+&>pxmH+H{F$m{ED#l-iz4gp|$yqY`LbVXKJ9p<e0QRj+WcWfK
zLX7mr72%Y2vrJNnJ9ltDd77LXpmq~}!A-+Y!;==?WT(R5E=<~O>0j$<;}j<~rOQ|)
z7%uL6P4SoK*7{{$g$Gm(W!tC;bm}rz{Ce>zR8r|+F}8i&z%S)jVG^AFo#s2t%(=Ya
zZsxa&bwdB;NxLS~s6-mkOp%}gIeys(2YlA*n|KWh2XCG(-usl}iml)Lth<Jn&rVK?
zK;`+?k4$_eJVE@943Ae2WvxaF%)&gsv~F2)?2gX}ao9{ss^{N=rI@Mb@aA%viVf*a
z+V#BZVMNF-EE6|c9JY2*N53uYmymBxptUf{QM>ncB}F`06hDozYtV+`8NG*&hV$pq
z;C!Mq2RX?t5AO`-CIg2l$>=DyDN#tu!;d@*1$3<Uzi=xZPGS6&9cNlppq-^&pC=7l
z=VI#VR8oq}Q4eeHVR<H)b@U1~VxGKUnV;Z)m+mpA#GkShp-28-x$Cri0~{)q96!Fr
zQu5BwWhZj=06OB$5aYXWXeoviR3}2bny0j;2#-04e3I?L>mfyroRxkurb<R;Kg>rS
zAuA=qKF8NB|1ENK_bJ<>m(*6XXx7&6V`sRWW7ls2kLV$nDp_9}mu%b!OH?s4^1a2=
z=pO~DlSm7!H9%6n)6a|>NEoZH9o+WT?xppv3!>$gBGZY}G>dUQ7_c4cL=pu|M2+vh
zAI)<<<y<6p9mW<5Fl#+jwC5W3lsy)}EPbJhF9OnVL4Us97n6IIz`w(&>=9*W<jBxp
zMR&0*WYsi)iTo}F)A7Ce&h?GMkcsvUg2b2NZ>xDSIKB|Zb1)X$?-2<T*F7~wXD(Xh
zSIrFNSn}Ot(J@J0;yj6wd1dC^J!0r2)wJK;A)T0KcI%4ZPh$LVIe7qDrj%k*t)y%}
zz5lD-fG<n*X?G|#OniaB@L56?)ysPfQw<i(QLpv#GsHLS=Y}+5-?2FG$D&Bm!-Q={
zZ5T8tMta_k{qPMHDK3Ke8YwgQIDh5llvd~HQIBPjuXEvGc)%<YkxTE~Kjy#uAYw50
zF!th;?ze+6voNobWKnb`C1%=tKSOQ(ne0Yh-8;MD2LvWeQKNa?{Pal1<A(ub<9d_%
zVZw|KAia6R)ZCHC$3f9d#8e#=sx&LG>E{k&p+vz78Y_4|HIgGwR(Ew}2J+UAu=C4Q
zU^X&Jzve`k^AU>kj|-GSx%0Ds6+D-SOHH4GpIOD4P5(Rf3iY7gf!|q%E$j#5tVWDu
zdEZU`*;qL%yDb(hqoiG%@=LNSk5u)uxNe${K)*5515i_Abi*&npZD>2mN(t3riw=L
z$E`bR)FL4|@hT2|qVdMuoNwo!AEC&NHARIJc$_)R%N}_MoWy+U>Z(hOxUhvir!8$U
z5??LMfaX1m+qYe2B#4cS?uNLLPlZZ$zdcUu2ySf8I>^eNGF7;^?kp|I({f69DSPX$
zYW>^eE#+1-*7;J2<ohqzw?}8h4jK5TJ{R&0iPaV~`9u@-n&NvMye3*1C;#R=mVMkn
zN!@8(Lj82GI7`vO&p~E)7ZR)d%t0`Ez&pC(#vxOO+3uIVUS0XMlkMEXO6$Db_f;&!
zg4@v-Zb+wQ3ik2Yr)Ai?!X~iFJjwdF*+-R94f`8?GBOSu@s`M>uV7;;*!ep$9fdjP
zvXcv1AGyZHu?LwV2}JVhLGi&o6T})tX<*lE+LeqvITLC_ncVi6)yU*SODro)WWV&B
zyAUiU9_(FAD<=CyQ+FyWf>v13&Ge2^X^a4hlZ7(d%TeFQdG?UeV*gL(Cwa06CXPx?
z4f9U2)34WDK6GWi?_x{k?n>>Bi5JwW$wm53DUqo5p3hVq_v7}!akB4<IpxP;Ldu&H
z{F7JZW$6-GFRX-xD~h3cJgh|8i7C=pX_$=e8x?L=4Kb4<U#LuEViFk-NK#-A{b{w*
z8dr_{!m1t^5U};o{wcjcajlJDhnxJu>wVLmd_s3`bgeoj=+_a`=4i)PKFD;`#}BwU
z8kGl+1qJbshn81%lQ@gVUJ|P$<W5i|y=Xtjd!J9hMjEIYqwY{#=PBcbF0&fY+BsS5
zBFzytOgQ;MmlA`cOXI8NbNrNn3!TO&RTWJnO9C?xe4{<bJD4|?*fHhm6C+I-3%Y&~
zFcCMto8I6eg4!04drEvemN9Oq<7<wo0cIM{#i!}TJ<m<~2EaQatqTd*9%|&#mdqs+
z1zV{((loa)PDYZZHLP*U_E$&eX-<K1N7ukA#0<p~Z`AWME*3d<iere|69hKsSPUh7
z?1Of9wg1AXLJtY{LALDCZX*4~^^BRXDw&f<?e5viMtSk-K4IdQCwC8}KEa^TF?pY(
zkc89A#jrNl#p8Q~BC+O>PAeX<*kqz*Ac#!HIiN3vSSt1jJbhcd?Eq`9rH+OT@a~ym
zy4bP54Stiifk}&B$;EFTJt59F{S1e6<-tFSb!Q-wTcNCAPDeE}jAPYx4zV+;)^Efk
za&TgDTl@!vabhGX0`e@sMQ}8)f20!`%fCx^zqp{TcABqOmW{*%Cn*@>sGyn~!_=ra
zS3Wj{qjml)zp<ao&-c7hjnd62VWs~9c)M05)T4UB?}Je$BHaC__cUCe*mQXpO^FTW
zO@y$F#`14D$b@K7={&cY(Ir!T<KtPuu}cKuzuioG;GSt^CIXZim-7!Qxl!X=uhz`>
zX1x!>OP*;H>pzb+HPuaf!Wy2(X*VhlzHy%0cl&OReobPXKWIBX*YM(A`}Z8QOnDC3
zi&IWDqJYM>cYe8WYa}x3(;5zP;m5x)@ds9+pQthoJmy~wW*^vE!%o?P!-R8q=rlhD
z25{3L9@Wllc{EI%m{%jN19k!@ZJf*7al9-EL1JEx4?stQAtjS$q)Ma}^?m)z=!vX%
zZ*TLKcC=T&Z}i5>Zj5gyV?DWJYQy&5te&e3kAYLi7is&dZ7;}s@e4*s#Bqwo^hOG7
zTj`)TFCL=Ntl!FX08gIsS{uu)k@=P^Z|w$@mC-p7+Peiz#bX9H^c<*P&ayiMiv<eD
z5%!VjZ+rFhr7ETUv%JW>9u-&WLGY;tm5#YgxQ`Bga?Ub!@JS|f{wp7MR#*&2{<QWy
z)$3}v@&v$$fIXeTgV_}dH|)z^cY3D7pSfj@)fLRT-0?H|{@h2w&6_A!q&5~sm9@?l
zIG|RbRFq9`AAO;J&I3N)P^gy{B-xSP7sJvBil>NPl9A8GuaD8$*wufEAN9KWL7*lB
z=lfm1b}0=~6ry0K8UVY$k-nhgXCyZ(rTJ)Rivro?%?9d8=3JNf&s4_t6Mpj>W2#A^
zbO-H2CTbVE-uoPb_jSs{f7x;bi~Z=CN&Eg8_h&Dd;f{bDgk23lI9iGNPdR<{BI=TC
zV~k5awDWO?XxnZdFChxnNl5;~*K-}=k}2kI+)`g$7Z7l8i`LyZP$aK<opmQ4EqLAG
z0o`XN9Lp6P>A5er3wWZ|1LbS0)@}9Ur2dOoX-pK)chVv6fX5Xvri@=)zyk{nmrNpI
zg2RC`Fkdp4io?fj7S0r(NUUXTz)vj%$J*0JW`#4GJzvZ}OkWpG1<ZERCur^5O)IyX
z4VaOhx4%-PM*p3F+}yL|Z9)9WFYxw9@(NfkA6b&8GW)XVdF1Fdb_`$3Ic{CYs}!s6
zp84laMEPUhI0FvNoc~NB+v+TWcID-+pD&XaHfrzRIX!q6Jc<2N^e9j)_*3V1fo&&2
zbNyD1A0pQGy^3(vblpA8&BuMW=!`}q>Ug$aij9lsm6ee4_66x1nVXY>!s?8-xnHkh
zvOoV=o*nu|h}NAF(H8RLU92R~|NU;MaG*GjX{~D(<Ph%+u5?VqFk|O_Xy8o|?VFKU
zb&frP)qA(4VDSSoeAZ(*uezY+pHPq9-fLeT;x~Q1#`)#V3FX;tvp1n{=1X}C@rP%D
zWL1#BN)NeBDO_3*e*V6w&J9wHOvbahA;L;6PQijciBvnG9?$z@m`JqX5586=cr+$l
zS?=kxY;t~<)ED&k@6$mP+*Vg~S<3q=KZDYgSlSZmRfy_nS7^IaXe4CpWPDC+T4Yib
z*H$InhkNe(fyq@}{5P`P@6Y`m`^2-TvRut1eJn4=2nLGDN8x>$@{?bRwTg``jI3=`
zJd6_252pj)ol%`+V3vl4MGo5b*%>{yEPo~HWf~R$DNmd?*ROd-y2A7)r#kV&GkM9P
zdE;ossR`98yk|^zd1Qxrv#d;BdAfhdef7*!q)D@QjM2<AfpBkCLf1)FX+(AdVb(bN
z2J;8Jj_137;|#C!p=BobuaTXV<U=7k#>q}zVcwvxvlIH)L;;R8y8CgE=XORHy6-<2
z9iSyCQ%jyGeouMaaxn<KhC@HikkIwh3k+iiXED}w)sP`Lwu#TRyWnZh2RKC~IGJB}
zm1J$WDs8>zqqx>huM^MWk$QB_42kc4o6DORzN$4XGjfwUO8xKdFt`L`68qB*MW0&U
zB-UHpueR;f&aL6Kwo&bQg3sB*Dr8Cr$4NA~4;NPiuEGImpg9|Y$oUjoN4qMkv|T}|
z9?-!uP#5`wZ>Hmj$sNNgq}O2Hl@~~c|1ej@Qfo1OZpolm=Bc#2`;qMf1WV5xC*E+#
zpz_LjJ;_sL>TL67Zt&8#nhP9J7e4MU`4aXbqAqU<1D5kUn)jI?Jw0L}jK`;u68aq=
zZaRyrug6>BPwKC_vh~7Fic1s>zX`9Q+HcDCe+8dnqracaKzva@*sT+e?m!ttJ5d;*
zfA`@`^~mxQ&imiF1e@8AsIs4*>I~Zqkik1PZ1LX4oweMc$6MsOg8{Q?EVdfV?&ao;
zX7(^x{m(!MH_k{oQ}C>m`)8VeZtWpFrf}%F1xpjj_c|}dx?Xw*cdO*DNss0?a{5M;
ziQ|$|7)e5|#E6*@br1^uytG`%Oc;5U=4pFD6K=l8`-)@@NzBP0#Th7<$yPbC^H_E3
zD5E*2QLaiebL6tV_ZVA5oL!!U)Sx-7a^vV&c*%>>(!^UQ=N(h1f?`_5*77hoQ?0^9
zwmjB~i$G!E6;J%};WLITQ`TOOy5679wpJ~V13w+H1-$mY8u!l>T!d1N&5nu)k|UJn
zYPUu4^Ccr|Kvr7jCCTctOl1z&1X_Mbr~7|#K=asdPC=_BJudiny`JEM1_->;gWIo)
zFkyIJ(D-%_n!yfebY!lHqs+N^L|bZ=`uxTEwh+ix#f%zeN|l)Ogs7JLM}(D&pZTG2
z`3unHEA5|T#)k2j6IU#rD&d%aXcnzK5TP2cgl=_+=x2M?{n<_{Ljp^~j$WUhp4r~H
zFHnnx_~p0R@Z7~8F?}ahEYGtnW<!-W^l}dxW>a(%Vq$U;Uc%<5*&siIU39W~skZoM
zgar6BpCO1-pn%vDZR8w*9p&I=C*rfE5-c5t8eLyv0I7YCbN(X4E<IK)_4DU|g1C0|
zp^;op!`lNP-dqjSNsw4FVN&yEjmj*cVFcRircPwx#{`X-m1<ql+>4lGk2+b4&`l%m
zFKqhx7q;iSwsAVoNyWNVo5z_7Ec~rY2N7j^{nx&`B9Mb|f9G2Ur_sYxqo2E!S7$%_
zs%GjXmEyQ@4dS^-&4Zr+gxT*hS^M%Knn0IH6x?!mdWpC?Pgs{I3T@DjE`F?L5us|-
zOW~A0nydfbuOfzRaXT_I`ID%*eZntE`g>7H7Ge9>Hd45u9HyBICy9?KA4VbSd;H!A
z<NoRS^?sn<Uf8%~KSj#wrx^I2Er&rU$9;s3=BvCcP#rgXjW(sIoAY$7ka3lVG#rT3
z!lC{rv;NyRZoN_cX`%BgohseRs~h(#Wx7rw0=%s9+0auI`!3H$5w}>9n8S6I@aV)L
zn!2Ab=}A8#4uzk}<!cggF&14ByM|)uBb$}X9y-2)?hm##h|Bd(xn5j+{Nf{<Gz2eB
zoUD_Dm8Cnm2-sejp5^ws%f4yNM^EoGdsuXwn8?pjGc6wm_381i9nZ!2qbK{%AtOer
z2ZWLhrpv2Q7lS^THJTP@ki)1kn)}ouE^fVDxS3O<X-!K%_jL(Zm9{hcMnb<fnU65i
ziKg}c4_k+aWeAR!;S5cF#(isZzp{f^J!(gfZMee!n#}q5f(_Q9z!Zr`PX3eC(#r9~
zm67rt=e!xUbX)IZN$l*@uf2MLCm(k?Yl`CPcD9`)flkNoPp6}{iB>eXpPSz(G>_OJ
zSLqOZ&V8Jj4HAHgULCG9u-v(<%BS96HB6cC6;gjj6rNaR?%Hrs{H=8&t=@fpb5^3G
zN$P-QY^ro(J4AjUPdlS;4LxPfbSRKk2R-UQ_t0KrC69&P<tt8(%^kzPuxp?27a3=x
zsdT(SfWG7#;@e8hC#>*Fot~M``uQo?xaRA|Ws#q41w$N;uC0%o?T-tNZI#)TGATuD
zjw^q_>>0DRYaK(>BTQ*&GJE2ZDz?<+st-bL?Z~Roup^q6cl@Z?_%p{S1N-&(GhAZE
zRTi{aj(?FC#y9dhj%;j!qg5p~$E?@;G+a7PIo8&WH@@qS_1+<{(q3B+s}*igVu#v%
zc(?x3nI63M<P4VZX%XEp|M;H{pG-_T(Ge<qR9fZpp!OcR9y^5}moi%D(z`yOs2pZM
z64OTd%-ohUP~)A{3D+|`%P`t4(_}>1WJOV&+jJVGUft(6!VmuVxm*)cwtIj|)Mo`{
z;uUb`;;M4PQB~@sZ%8bt@7+0cFn-@k8>eLM(nPP+HtzXF4ap(n(Y@p2E6eWi&pgdb
zAc4gdPV20pt!`en`9rJZ)bGwpA99V~SLm#uZ}iFc6^%D>LTcE3EYcHRo$MlYlhv7#
z>BidM%Qw-qCfDx-^B~3c$Au;0P_y<Sc}fXtf!)F5CW_a`wq$ju&V8IIgs5xdzM3rp
zY!9Xj!D4RXlVlAPp?iXC$8}waJdDLBWpom<Fn*ynLHc&P(>`1(m*!}?=Zii?O?dFx
z+Wa}yWBcMHQPah%ewejZ0rB$b@sy8^IGNn0sPXD%O+CMjL2)ZhgJO>93Y5RhM;00M
zEDC5;ML>6;Yi3--qRG&D{6B26@@u#WDem}!72A8I0+v2;t|6B`NfQ|!{ov=9uRM*W
zx}J&gMl!Cc28Kao0Xo>r&LJbWYdcV;AHNPe`VEA=qU=#rkC(u~qw`<{7NltFgs4}i
z>9ym^WA=YW0RaTVcgl5rz5g=qqxax!7!9)Asm4<yXwabI3brocp1K#AV$7Zv4msM2
zrwd`kkCcB6<OC16MMxk?uTHEaWmuhc@v-OTW$BE*`|{#9S@dCLy$ZY(-RLkq{e?NG
zr1M(5dSY`)i*!eV^5OES5O2dr_+TkrgPnx0&1&`1Lo@H;SQgFz+qpAwlN&uymKhaR
zzpl^rLF2uqV?Hq;OyJ?`n%rHeX*Zl@<AqBxU7fE^`+hEk;fDEub-5TaHQ((`3{iRB
zR35D@=qn*19^4%J2@hu%N4`Jro2~_7W6u8jDXE=}GWRHAVvNVy`6iBd{>?A3#+>G|
z6r$Ce>wkAZOrG9m^CJJ1AP6`O`$avXX-PuppZV!MBfWQKC0u}^BzU03LdrsK94D1f
zICSB~B(u28mK~9zXL|Zk4S)iL6ESSqQ;EqjTWcJHW9ybv>)Bii1#&XI&n}y0O#>~w
zh<LwOiC`fl4e*uf*chfO3lgLper@2Ncdy58>#07vg;bZVQUqVEVu>lJ#acZHe8sYB
z08jp)BqrBS&-s`+l6a9e0RS;*QDWhosuqt_XJ%#={e0Xb=d~3=L+1oJ8D9B|=O3b9
zJ_a+RMy2EuCCZ=tp;ZA=dXn%DMOrcs{`$Dsz!yXtCqTjMoDUH#S61f#nf5L3Tf%+4
z+xpw>jUy?4KqvD*)ER&xdcpih#C!5D+39c26(iR8&jm04Kfb`kzmD=Wwbos_g|qy)
zi#d3ha-9&{tyH7@%H@&LrOA(e&}nFFdY1KNYcVf{wX_E8>f-Knces9&nD2;z-wWGi
z;m+1Q;s{c_e1}fLgHu<|jru<vYREt2D;#JiHdc<JVFT)Q63;E-H_1BNxw#%XF!7!J
z7<S&ABTG$GT{q<^x%<TtqUAwpbgeq4q_ekF%6KtBQKAC~@r^1-%9oo?=mX>82!p;E
zDBS}A!OS**daWFlrGJPn!M{>uUONG;$=Bmzd|1rs>07&@$M|sh-&Ww?{MG47YP$#N
zOMLgN{f0i|bt|5hL-@^J;m$~|tDWjv-A2k0dG?r?XdP|x(USbCXA_zFP6?|&X2Fi3
zc&B@Pc0&Zg%|%3r11f|qX5lvbqy1|ko#RZOeq+tIg~}bsi?cW!*-i)X#9wtzB~q9B
z!!VSa1GP%W>6iu%G}jeyfDd?p;w6Xk4=K?PS^guP)8cC_d5!ySDZ?W8=d&Mmtb(15
zg6&Xv=7@wyjaga687wcnx94nn3tR$kVC0z;z8ctW8=FIpG;+Z!c1On$(~3{7R^4EJ
zbB>!j(-aT=QteAi0Kqc*w}Pw(ln-T*|4`<_@yXy7<^!Az%ivFIB{O7~*Q1Y*K>lAl
zQEW9xl%5w@Nj=i#45lqz;G<lNM7XYAeAUl&SERDz`lUNCyRB0b{F+|R=St4hvL(Z&
zoT5p*9#ZTK&5?B{ROP8oZAdB{8jHAVr$zU1o0f1PwU;|#k*NKZ(V9qHVE}d>U%lO$
zJ+?>vtlV4j(Z={(J!>bCGuSXbP&GO3UJ)=+{kPY)2P%!$LV@Mzt)qRQ788$6VyzWw
z93B_LM1LaNisz&&S~0zoHQ0TbIaD%vi#!~ZrR?V>CC2$9?z1kfH196Q-BPf@x6<|k
ziWVFt#9iXk7G#waRMJs$?pP-Cbo-$H6=y(^PqEyR2;W}h4^%BF&=}M{-2<h&{!=ME
zqoSPL0~tpt=L$x`FvZin`F@2HqVeEud(P0Hnc^%xLYjOzmS0;|%^!H887qa7&1#U{
z8a5Z#m9`roET35tyXHheAJ+)a1zIqXnpB@b#s1O)PHFo9((A$(OC+)YB?R7*x{AT4
zA)I!-s83O!DR(yuQ$$`J)&W&K|6NDT1fhHm{>EsQ^N(@te|QrHQyg)*E^hE&8n_7-
zC3J0aMVFs7{`6RVTigvHN$a5A6#ZlVeLEmV@9{u`7Mb0m*Tp2^_GO?Mx}D=TtekPA
zS{vsVt^0V6O|{5<rx<WrwdFz<lvX?yOr$DxQ)hpSBdJN<RJpr6rm2zxjuiEZ`*_t+
z?eHMw>-FV8j5W-FZQeoH%jGbaj^(xo;k8}6L#C%X!3k^GKYbA6MgDDQC-YzOy8u%k
zDspGx<`NykjkbOrJL|X}n(}r$$G|O^XD63Qt|j+~&u(1o)va!YA*oEo8+FCKb7F+-
zO2oy`8Y6G^QwO{^LzP3K#Cb2^l8P@SDEUnN3NNXgWi3a_s91dc1ysMVC`lyoNV20E
zY?g~iDYtOM7e68qju%r`VixWSRK=#}F#8cwG=I5+ute$fJ!X>w^uTUN&8c3v_VM-m
zXa-ubt-Y$r2?`CGv#o;@e%5d+q;l6<O(M@)={=ve-ZKIKngX?03xA*Uf8V9K;w;32
zR~!OBEaxwY2$MEc=p}S6x6QCnlXlr)@lm<*gGH3j<;&;>z4+v~n6Ihi4^?wsMZ&oy
z28(g-MU3z?&5{6ba_#V(_>Gw<tHaNB8&{F&=b~BnnNd(<TxJ2aq^-4;hZOUYBRMW4
z>)=rlbf>(#R6PFYYTO)M+^d1&GsdiqEVpgflQXJhX|zwpg5M`OzW&nP6u!C>+I$(o
zbB{r%F~xH^4IlTguLy4ws+&)z;m6jL&Nb3M-E@?EybGk4T56j~N=4pXhz_EjG(c|p
z;Q|2aebLU-EC#=^7`ycMnO&i}u6e(&i|)Pjiecrl9p9{q2@1Y_bq!~uK~x*e?nNd|
zho5`ZqPZ$vN@)`Kid8dgnpkj3W-gsZGSG%$g%8$8P`TL^9Y;l^6w2>`krX+>KS7A`
zLG|8MxY}P+DSfu$E}6)|ULEy9t`d{XRQv*F`bAm*YPi#l2O{%TxQ{M<tIxc=8uxX!
z97ye&54rNa$$hIdQzE0q-CW^&ku7eQN~w*ckH|^AIj?2wc&<&ns`~p>&<;N+Bsd9a
z2x~&Xi*xHE`EPbUYZsnUHF=29|0*?1={PP3+`@kzrm}ZNm0CVs`39%7WBeW-^+@we
zWCwpEu@buy^B%|-C2={n=>xPcPbQj;;w-RKfhmTwM#_9Y{}X)ODf@s@HgM|#l&&y!
zN9|m0klm!Y0N&8(C56(@B!iFGbgqjpXi2|@WxI`j5Tg@`ZQ^h4V8$^lUOT5Mp4*!q
z&twTZX)`E>&Ujclm&OC#X+q&i<7^Y;L(nsEcAJiB6PhDi^5q>UH;2=Y<_hcL64gj#
zR*AxF)?x*CX>CWC{o%c-u{@6x1JOD;fTHuAakR7z3N-T(!z&`VQrzeWM7|_^01PKN
z2m-S!ACOMC01Fr%atV{R(gt{{jw`cx#Tn1D!&OP|kt1pY*xvjXSPC1W@^A|MQ^WV#
zwheXl-Mb+z;WCk$9j@CLAi9Ub-;^R!Os&eYLvQX-jCvfd-Jxr!%GaS45*J1%km$nG
zgc5T$@PSzh>KQfKH3i*DSWZ}4=(@O9fAeT)QA7}2YM=S=XGt#9;vj()j-KC}UDn<2
zeswENfWjnZBo0E+<R|dUHSYb5$_~YS8;wr==T1`aM^7HqS&kDrMT{^7KWlK;Z$twb
z$}^c@Px;77&&0L?TSF3^227V2B<oN&S-ksXp)=tuC^`GRr@`!u#rblZjK69w)!9DD
z96kb`=$dFP!4DCY-pc!N8CXTwBvyT~VSEB=zYiD;TkHjf9IHpeEsPTEKOU+#gSrB^
zxuTuU!hpopYSV&IpdOwPXOB^iium};nN>tn=c^a2RQOhQqP#CpS2rK7)C-nos%`<%
zqky(1vC%5o8`?ID>%|GVjcEX_gb8|r?;adZ`R-0tc}0YZE*(!wM;T?L5unaH>wR{e
zHzGOu$0(Z4*=~+O5e^*^50e+q_7<Z@;^?Uq00PY;eem=6F_5R{v!J=&j$r~mIM7PR
z3T66d4fM}4^o*x|k>BqTwF79AI)@{Lw{OXgR<_rEA{P!-HG&yxwrJ#W<wuyHmW{=-
zdssH`5sQf~z(a#HL4zZQ*eJ?zfn{LC$I<#1*6sAlk2RDS^VTgWp*z2amD<q*f;NOq
zW1BQVZVc19gLB)LhpI{M)Y(UUO;aw0`fZdh%&v#FPI14c@t-u*sY#!P-N!S(gjpM9
zi0O{@j;uH9l;ymv3RbXIzjizsHbhcWZ=<pW$s!D!sYp~#*4tA+N(SB)hHH@+s5jFD
zo5x3MbYAFk`!qck>hY><DVr5AZem*1S<9FgfA5`eOI_dhBgs%|+|@|l3)uXJE_jYF
zn2|QX0hg<P8X(rLQ%5|LPom`;#tTJq^!>3(dO46Tu9JtWzyuK+T%vq<4gYlicH_k9
z5%{r{uU%8!;CL=HcWr>%MYp(kA#D`g*>#cuem)9{ieuk_TbR233>X?41W?S{o&V`0
z^Wg955P0aiRJGvO`V)X3jj0#dKxXBXHhi^UZRT;fB0R*dv>7he-Z9kL-8*!pnJ65a
zz!T8K<Gj@Ay?{{6j-D4%s22;R`@|?+zcc8V@N&#ut)Y3|rwu<%s0rE;b=qI`|1>Cl
z6s{zj)@<z+=lmlrVw2zN*q}~b^qW`V+67gWq6B3BMD9nkq|PXJ3Kh1+5r}jjOiCqm
zia5|0qkP>wm!Z=1`GI+^*kQ$@OQRH3Xvs#(GOGr)Ox}@wKiw>yz3VK`uJ18;>_jPa
zj<-)C>%0yzK1*0|?OvyGI6wI$?C2=wfM!(W^FC;>Jx|RB^)=(Qs?$m!!z{)7d!8RK
zVm0^Nu`u-T&!}=S{x#2+h*ortkG5F8<5`y!LAnaQ?R|an=&?Jjk<Uy$4ZT)U&uWUE
z7C3X6z5e`*XJ=D7zv=>E1C|Pi+Zk_E^me{L>Jpl>xCD4Z*PgrSjD@|Gkxi>{a!A$D
z67Py2W=N1T@Gb5?&}Ymd8nP*-uyniW3B*Jb*zs8(W9Q3Q&c>_GgE=}FqR06Q)Z<|T
zCC5X|1|NqHU7aE9?0K6oy=l&<ZBR7X!xh*vtCm>LiO6M^UT?Y=^!rTHew^CDjP=Fy
zf+J&^F9qMUSO&`tRkFKKT;|d@^SH1ShLRa&)j~<x8tiu;sh?tGlQhm^U}$0e4TH?#
zCJIK-poCHnS5JRtCP(P|_bk53gy)aj^+*qBC5*Y!2G^28LsPRJ-@D_9)zA1i_^AHa
zGfG*e`WRdFZu@{u4lwfP53C>eCkMQ(7mjmv1?NL;M6Z3k!e6S8?5m#FrZBjc4)T>i
z9Dx8l0Y_Vi*)5Y>C0jt+cG}-(rHtkAe3NJOB7%QK8l_N~13j)FPzF8><IdS*d?>sH
zUm^y&dYIM@c$l|>PlvriL{;`>>PtfhB`4@@Lp*~i)E-%`iQ%0hwW*lL(4Zd%@V1Ax
z7ck1xrOZI@`PJtMvh<wi{g!+=Ia&BS<DMI%vM2PuDsU4!cQT;7{9k;b(HcJDvJLVg
zpqUu@7;T;Y;>8yx&%t1L@>rpnPbCv{O#?u&%E<t-!MUd*Id`+%KUC^w-xgkFiegw&
z5ov<{%8CF@ls2y)emOBMN1}+$|DT@ZzISFL$k{0TeSkIbo(bU<sGOWt(-73?2I<AV
z!$foji*9p64*&svx;3s@3|Z(<Ce9w&xNN*O$n{hqI09Aohqw8$I6p1<h2<Ta3=vsX
zfj{S$tAB<;%v$o>#W8^X>t8PZds`x&-sbLo*;I6?0iw+2pDx4OUDumVy244=0aFBQ
zUBH-8jdP{CpsLmOPSo-xta%9uE<(~;qP`C4&FPA2-Gz4gHR=)WjhHo%?n>BXPIB*1
zcuR~%MU5u%UY}psaAsItTsLk5yd=1u|FY3gm$G{t)+pk1om0Q?qai2U>3#o8W)sv3
z8I52a#7Bzb_dKX0KR{++0;8j9_o0kphXZl?l5#yL+iBlY>7&4e6A+`rEV9ut@I?#j
zHI6phBCL*>_o`TF`xJyNa^07cF|Y`>b~PZH*Bw)YyMPY{KF`}SS|)uKJXMaXaj$j-
zT=IR}fzaz7u9Y^FP-Mc^t$``1cV~d^YQ$q>g{}TEd_n=#hEC1q^lY+lS?Zg8kOTYV
z_lvU?pl*<lRX2y3?t!ZBksXKa+*Tr0a62Qfa|fLmgp>1_#HFY|0j-%_x{$1Fu;cz#
za{dui0#^g`rRKZBV08b(A#dnC(e77%^HA@_);P-Qn~NkdXs$MV=$>p%Zu3a(r<~R%
zE;V1_hZ@<tbJ@OAit#8gG;RFcz0AVCTl6*Orw=tZiofbp;v6_G=QgegB>m!n+1V7K
zES#>RE2_CB3$^=Vre{>suGi)n5D+t6&=qIbH0)}yab#g4%pbaT)$|EK!FUHJLf5mm
zFI^BfN71p%rI{VgzXvw{kpY3;Qu0R7>wL~cU3P<rPU-3Vmeo>qn#6{A5Ymc7Y_%(D
zM3fG0YJEwxhYtf)&efd}PUpvOi|{O_Ael2-E!ny?Cby+q1Z!pU<ZrIFvw&D^Jpe)%
zn2_e(1D&+Z5CjdjR($Y>c0vIQ3OgQCyZCJTdQx4o{Lh8#D-BmTyPT`bQaqZHNx(Y-
zC&;Iz@a(F5WZ{g^`X)aFCzHxdd8Hyw-{oHLHk*Z(q1Y7F8p0o9k;PPCER!z7R4KIl
z8d!iW1aRzb)?-iz!PjZ`>+vB<C(~&K7JG7%Yt4ilQa|}VACF<2Ij03pRVjj)gOqgw
zGjwBLcac|Dgeq^QuP%8^tVp(@?zGDf+-?1s-P0Md+@$(D5I|9%cskkXzXQ%G264Gu
z?w1@FaHNhRzvcP$TlZ5>S1jA>bYsCyQlj9cvrcn-eo+vBhQq)}_xOWq39iBcruc)2
zlL)*my~(8`G2srp?WF+>X>@g$&gNs|3}**}-e(1Rnzg2(T*+d&?ZzxZ!;a~tuH#9O
z1==5`OHynm5CUJ`xgrlX&#seRDub6zTTKZj9fPy!>Nc!*&H=xc?duZvqzku=!9={u
zKv=kRQM+^;o)Xo2UxFiRE~^KnzR$+ssU{+6Pgimkq!U3h;?eRVW!efMl%~y8sm}S@
z<x#OF(}HDpBTaX}=5`X-JM~!f`vs>sB<^^Lj#H!y@rh(_$DLfPn<V_z$FA0+bw$0_
z?j0~iTljn1J|q{&gRtW<o9MH;b2tY-b%@9c-3B)Z5}xe8KOmhnJ$Q<5sWbBAa9X5M
zWoVWZAYT4P`~al83YU+{9qXTQFEUzjwkwn-zCpl7Q9^*&7IYYVMZi1p<;ZP9k=mNN
ztj%?R{pxPa#JA1?*CB+71=|yR6NBw1(k46LL<2EU1LCX<H1?KDUsdq=q}-YVsljIj
zwri%kCs{AxhfMqxm*9}@00ylhGt+mf<^%1Mw-3r&xRp_^Hd5Ev5U)fu#h^8UA1}r@
zVrGGaXD=A?zN_N^oZ0UgRR?E();8`d6x->EEi?C&6a2j#eWh?P(Mqc`8vLb8QlNgU
zaGZr@d+%5%I>g&l;{0ulY`JtvXMHdI+h5O~v(zCZxud{|W*c2@^DB7i&#;+;fKi@W
zian4bFb6x~hXmwd=3!@b62@!3e47ymY&W*+iZ&DgZ{xL=!HMV0<Nh{ePHl5a2RfEd
z=PIIjwueY<rQ3+@hx}^%0+2?Y$61Ho=QUtNH<{0SZp&6Sx{p4h;s*7Pcq=9o+Fx~Y
zU5EI5>1!*R3>o2tbYov~zqiLlObVM-p3A;ZiG71J5Vhri&U<G<ZEn^K@1r*NCT$o9
zy~m^FpD&?{krbw3RJADPHHv5pAgi;oV+3v>!et>i&ET>i+Bb-a@gd>PAAG^F0LCs7
zYjV(U6X7o=yeV67lnX!5I-^RU6_X}8A8>diB6#im;&M>}^!u%wK;A%u$e*_DlJLHZ
zC}v{okURTC6K7pqce^4J0faAJH&tpJlG)K4r!?}c=PjWGqtX&bOeB1p*`y@23Rzyw
za(`T)xQt9Hl*x$l!C}RZ&>~tdw&$lL*t|NOS9@HmyPZb*&DsrVO-Ceju8F3kCXhZZ
zC3(LpV{x6nJ)Ychd!B{|2G^Mys5S(tTF1Xamg!GRu|{$Z8{MAS7FF;_SDJpgq{b^Z
zjIx~t*Pyfrb!r?oR(8#O&djf3Qt_YV;_s>xxnVJ20DDrjS_Cd^t<N7P$mW8ZaFYjb
z(8k2ReLxe!#3DNW;2tk2gk4;xGo0BL&1SxQ7X@EZ(0j6F(<Ky(*`#Y!FkqalYqX2<
zPh~fh+1)Gh#^tpR6%Fg_I8Q?w(^D}cMb8IBJqr~leM3zF2M#~1nPm5|f}6~mUK)N)
zzyhR$zgwc*JkDsTq#E?8FIsWzdF#}3E$5TnOhD|R9R0qV%piYzsumA~oiK9-Ybb5<
zT4(}yXs{TLzvn88*gqegiqV?ZEIrdk3NWY$Dr2?)ju`v=YcGmRlB-Da+up(D8i-@Y
z7<Ha*AuokPJcQ!PfIz}OgLSRTngaziM_KkXb$Y`w)57FlLS@Mj%i+5Bq1=q`efid^
z$U+~9FLVM)%`_!SsIRTf668`o4vh@Mhnt9zl$V`r|E{ci1wc6biwzLtRIOKi15HzD
z#3Y>)C~yIWc@OSmcc4DEZpjqXJip{bDm}Ses~0Wy%bkdeF|)f6XZaSIPCwPzH>d-)
zMkPzR*=W-|mv{gWMl?<11@xQPavz?=<qT(~A-AVYt>NzNzV2th=P;eBeTNlA(vZgW
zD>kLN*n1ysZY-0(^$qf%*p_yT9-6nuF$=)k?&r~N;N35LD`)OoM7->fdT2(i*;Y&4
zvBVYAm8d%i?4^?GQp?+S<E-buZ-eN3ND4Y5^QIM(D2kdfjln*&GD;w9WI-)fhUq;K
znPrcf`PX5johc<b!mx42OVN9!`^J`V&($C4bmzOq<|Q+^FeF)Af}QeLWydGQFotJF
zugQ^%MZly35Jt?aXFcPWNl9#^tl_*q2d#^&DUa8_yR?SG_T!RnQEitr2%iyb6ykWW
z=qSLX-JjhiMBQ1l(C%t}3`=`sk8?OF3SMCocuVRW4-|~9jXn#96U5i7P}>8&D4nQ?
zRg3hpix`V}*FHL>y3wmF?MqnAV|$eX{$*jKacPIYYO8I<z28!3JJ!`eVF$fA?vwBP
z&j2t$<z<lNqqS?`b@r}h?Qx_`?)Z#&d#1t?*aI7DLj+f{VDG?Jon6@VW#)uhY>Qe^
z+Gtya?;~yDn?;&d04=6Qf6d<(up4R{eQVI{J34ZttDo#GT=wpb<d;jT8iLH2+v1c;
z29RPE_0JK~PSr3R_#v)G_Ex6uXUZHr1(6opH?7MI=uL-jATCgU2!}QyA@^xy{~JW6
z2d*v+Ol&2$<#>pl4ckR^>p>&OXY_tt@#$kS+r(2gIYpzW(4?0yPCCcTZD(7|t2ll>
zXBwk8L?ju3BnY;Q%PDk>jxFp(K-mFES`xa!=zw&#J?G!Rrlk-Yx>mP*nUpXnf6aXH
z4(~VA9evy#X)!4>^2AsAT`_ZMpcP_6-s#KEO-;ISq;G3KtJ&>+i;2_4)r<8#5J&r9
zN`!ZldD>gHVM<F0!uA2?OgZ!!mBAGW`pdC7oNk>*Tm?0Ewxn&~jMy7@!>P%m(hCJJ
zV&_8RGkBZVc$9e{?`=q(VW|B3S9H2$<j*^B5l>-zA`3aRgpW-g$pj&c%L3g9lgM|C
z#7;UXwi-jBi%I53XH}v1O*Y4?492L|9JGHrmap&e#^0)_v@MDlAmtNKr-BecDdxt!
zrEy(+SEr3QdZ6vUL;0x#VRV3<Mc-<as0r!mkPdB<Qjek)m0{8q+U&I;W$F+Vt~U9z
z<L~4on(*y@3lQEU216IxHIG!?4eNPp&1SEc=45+oUCd6E0d8PAJPqz7kumQl-|lav
z9Gl^=)$x$lZ&0DZxOX6EUsnCz_{kma;lR(hD;Hq)<`SrL{P(3BQc<~no?k}*u=nB3
zGvO9QEI`+O?%*&wND$-VHf;nRy#~JW5H%-aP>^RCQxcTBo&)RTj}KRJv;DGg44Fcw
zj6S7o|K<G~b8KD_aQfbFuRg2^pZiQVeP=*-L2QfthCc%}s^Jx(7r`({vf)(|djDp<
zD&E#c?L8r4zzTF{l>vPf*U7=_M#m6phy36=kI)C3huMYbt*THD#>r);01UvRFM%^J
zJCYg9R=ElWTD!O){Q|{`@9r*5cxbZ5Y9|t#*MDtqVWs54`q&j1&}sAvq+Qx-&XJZc
zfdB^m*tafpM}xW5NOR4n_{&{PR%wBDlZ&|<YrM){<DlmAR@J|e^)#`{FD^|$vpE9r
zQ+U$&(9#I6ytB{Iw=>waT2AS-&+DwngxCEFkrUEsg;Sy?)`P2qH{26>r@wfmmG);$
z88oIJOXXY7X6*ck^Cejm_MLS;3eoWDz3|*l<Pm9nmxWCkbP019-%YE&e*dGqybqG+
zyAv@aqb8au^L|*Yawu<jU36twuz@ehSQ=co4%F1IM5tS&q=w{<Nv+==O33wJS&c-|
z2^z=nID4Eg+5+=>q|vQxAm;-<yc1ZMtlY{9UR3!GrOp;%h(()d*g)#~6fo~*L`{xT
zn->LnZ91s14u2i%<ivq^amjriWq;1RqG-(GRg#8=2C28OiG*Am#GXy#0~rZmPL0UY
zt`BJ@b5<qp;gZjrh@GriafHA+J}T{_6wj!_0p#9>x_{x^F=PW?^A^;2njU=flJv`M
zE&C4dvt!O_-^)gu7?D8*WF?BpY!_|k3-4GW-)3t5nK4C(c*j9+jr=a$4jCW)Iuoz9
zdTdY3Z|9i1=upuHe1RzkgXNY-y}8+|zZkZci<FQE8E4gbE%A-S-&Z$2y7<5?23)8W
zceHOEbO-OO{o&aYAc|u>2p>Q@$C%!_3(@u2a-LjqVjn6$m3}h}5th5!!N&Mtj17p}
zHxy5LuGSE=<JT6lc*?w_&`zD*cVAl@?J_<AJrGb8?BFho<}(-PBX=;N`x4zm58X}<
zid9h1W}gdUuPenE@yw$@X$wI@CsMdt)5!1(xb<bi;}|3)I;!b-@?1ZclPc`X>4QU?
zc)ZQgJk7z?V=N!hkmCM!8sAQj8`Zh|u+=lr_kpF|tr<^6-N`JufPE^;aeVNL$h*KN
zq)tK%b6B5{PW*B)NTZK00HG*xY@99U0|_s~HNaB5;Qrk$VKTl$)QuS1YZ=|cebQ?Z
zbsu2ni2ae>UG*-IwX)~~5QuPgXUf+{<!T^V7#{l>`7q><cC3Dh4p+QcpNW47@B!aM
zI5?LK-$E_6_Fd8j^xd{zzA}Ahnf;5Wa)>P9Y*}X}vnS)_NQd;PJ)+EGN^fz?f_W-J
zvpx<I8%?coa{dtZ1h7^c^FH;{tMEdV<ST+Dr&w#dkgl?xbE8(sxmsm)8r*7){=1gk
z7dXJBDfhLYCOZc1X*Pa*c3h?qhEDyAa8Lodm<h-+)M9%fhOt14)o;L5T-`l<bpaMD
zIjvc3t=@v71-PBev#Ws~pH69C!CY0#U*IJi{!)#x-hN_bX8C*(r=1fE7p>V^8u)z7
zew#=QTcAKfEi$Txl3|Zi>pnX>AgTxXQ$J-g2W+|^hKF#W74W-9|N8EMd+8(U;U<GI
zB8R7cI5ZP~On=ttyGgdflv~<znLZUYlZ;hN0FAH(&_ajZ5xj#2jn?V68OC0Szu6B9
z+%WXf%anVwmAMCEkmf#zO%p!SZ93QRaBSHuMY2*_)=_xAvc5>?G@omv@jV;7d82vP
zXQSem6df|$<7f}G2MYgB+8H1CIEoB664Z#fC|JP*1V2gX6w2{y!XYRtg?;|;7&}Jl
z;cs>k&64T@Y>I1tB7pUj8gz82RmBo7;d|YflA;^OqQS$%Q(_1=C^ZmEdh5=<tFk2s
z_bfJRyOS2`yHSy~mX}jBHIivi1&puq6snN`LOuY6{)0+1@qfjrK~kxs`Qxx6uT_%s
z(Bq`2@tTkPilEaE2~mMdUQDu7J!hZ;(YFNIrZtnQRe66d&86H~m!<-0_>7#)ezbhA
zN5kXzrwqIe-Dmj<g<1wY#|p_gaP-Nvu*>RKQOH4xL$j=m%sV)==~c$l?v5+2)0R3L
z#x-Bvj?S$W>u<yBn?6FU!fJ>d&$BHm%BtdrD(Nf#YWLXIZ(%Cy`~Y&)Z%WK|&ltJ}
z|AM&4nRoU8#whNd!5$xjUS-TL7S~ElpspACX?eb!t(m3ZRN7FGlgd$s<;aTUXp6-9
zK}Eg=H9q$q86Jt<U_$e?R$BFtr6~#Yb>H?@J10)#A-fKHrJs)EKJW-=^5j?tHzOS_
z+lN9*!Ipe^Z`dWk0d5}w;|n~wM4(U+)YF;l=Xqn+nVFeOi;G>y$TB0z^MjQnQ+ao4
z>dU1oUSA*GY;VM%);2IC1sytWb@Ob?Pn3q}P$3@R1t=88Qch$Tbxy^#x8A&?{ui+R
zPZApYPG@KN!?!_N>(<(X`ubbKD*#A*az<5w1p2+4(v(KoxP#TqfOD994Qg<<Y;9Wb
zn@`h>+Hl1ziE74+$oT9d<u26<=Ob*%va}i*CespSRqGtN%;Q7X(-UQ~`+UrD`KpY$
z7W>Qvp`{7x&z^xsD&ly)2&v`c?wvdkr#w|jM-HtY0Ye!orN)+pU0j=Rhm6AuSm;b+
z<+ff=_!&QGAJSztLZ9)r_sf;{$cJ{D__jJD$dNGE{7kVIeusCx{S=OmuAuP=n_jKs
z_GEcm5aG>Qnr~ls^%Jp)t)(<Y&NB{{Z&&k3&7b13k5}=8E;GV97`5eo3x`rz3PBf;
zFgvZAFvT!4<A)%?5&wtFVqKyYZ(Zc1FNoe@3%G7<uLI~(tTWhChf_k&<Z;CdSHu9s
zYv9O66@45Ywke(XYKP$3xx*<1%GlEBa?r;>S6@Xyc<(EZLw3sB_*`y<*#eFHR$Z+6
z$OI!qNv77W_?~Nw#qu)2#>_Vr(=&8d<pyg_cQz3)3l}Meq#9LVA@w<)pm4s$uQMB~
z;?pEHlNcp#4zDn|h~+3c*Vb=FUykks&2DN@A<I<`3rPBOenQX#oVwxO-(7QG(|Zp+
z`}yh?FTPx^V4mBB&We9d#O{EAs~`<MAI0hUO~;Y1!+XJl{qbVmCa+U!9@DOGZ$hAF
z3qjZDOWtm%A$%)Fm{oD$vRdkOUBYFnT(3%1GSBHtYx9+-aNw&0X~9yG%`Gd~eJFr-
z%hPf0zOcIUM?5b7F_Ow;cCFTOeKmNVmMR!lSVk*)LwZ~!ZA$g?cjBw9MM8Y{1&Rmu
zJUas`hlIeV8zj;kQLtTS*_wKdx%HXl%YedxIY`-54)Xl{YIM}Bj`XM7%@Q`UOPg`n
zVA~Te;hhjjp|7WnrK0`ZQQWjR#fgsPhryGj$KeLe$6M2qg480E39$#=d#Q49t4-uL
z_je~7ox*x(26Tu>4D8=cXl`4*R6-Bn7#5+gGd;3*jjRJdEK-IZrOI;yO|mBzI|<(O
zSe{V5oj+-m5m8hNVAqvPSqo$8eM3h%Vk6W_;_PG8FVW1m^wyfitz81gQ|b|pF~R-!
zVj-KF4$zgYjb@9BHJxJjnOY~$CMXcL66m9mKzzz-TLjUA2QQ-CQm!Pu<)(&&4ZXU|
zI42ST!1AzSygKwg<?wfRKl7kl{};1{l|Nl;t$2s-3FjX`WRnzm;IeEYo#(+qzlD13
z>4;JxPgxG`W9XydieVS^EWF}=n)}1mQthMCPFC6oR_M(}9h6a&{(VoQhKXl&;X@Sx
zb~RzhNp!cQXNQDr3f#&NkPDY*yGE?<VE$O6yTm;kMNooi#AGKnU2z8VsYWvGLPVm{
z`n9=hO+rL8T3s)EFRuVAE^{XMqUR2Hxq*)6HKusoG6l@<%W>w=#tP9;`l?&biNn?w
z2iO39IQ-_SY~dFnC1F(|Id`>wn>#<N&g}-HGpqA>=0h#^%f;ebJITwdt39SX&(9tr
z;^nfJb8LmYJ@+2-=7#T{o_`kS)PHo)XIeN4$hp(q^I+in0=)mJC<5*^L}>|I$j3jo
zKAuBND+Ry#$nxX5O=pa1^xH|6;VR-YoY$`PsjmZJHIf`rtWYesqQ>hYe&MqbB<U$G
zqd4nywKzKDBTHH(mGU*7gq-?f@T3jA6jRS%m>U^<Gt~$;obF&hL=Uu_^lxh5Pf%WK
z@UEU~PeU&o6`Ck2qn>ldN|n;^n2ashl;27Xraq?YtaMOjWJ{DqeFUIUoi3jOIA1sW
zMMYa=RLt}W8nqa$N~)AJneAj3lIb;QKz`j%^8-5PW&U>m*m0=|m=674H)?cvexR>A
zI7=~+vG5`1|8Vx!aZ#<^`!I@%f`Edek|I)4Dj>oTA`+4c(xuWZIpm-qAd-TVlyrCJ
zNI7(O4c#zw4E5fF=Q$7O9MAK8KR^HC4`J_p@B3csTGw^0wK=gXB-IZA=b-6nci(Bc
z(17@Y+lAdqfywBXBSRlO&N=c!r;}GqZoxOkmuRUjuTTey0kPfPxI0A+hVYaqlU7IX
zWyGLbc9Wc3zhMt@m6LDGExIitwvC5!rKggdS^ZYpf$6ZuH!J<bG{4LkA~`o)Yix5<
ztsT|hFkMD?!I^slX0cE^N}*{q@c!0I>)Dr`GjV#y{n_;WZ4T3lsO}6Rh07-H7?Sm`
zS@%SU0#I~jum&-Tt9+5kR{R6ZV>8EcqZ@gg_wo*j6lLe+ejS;ImFX>#rQ>NA&YH6P
zOywLkL)}(AO9jb=z7pMpbuz-JQHPb(qVPMaL0sR}1ADF0O!#cx99DHvAMsP-4pe-c
zf|-yOC99h(9LNJzIYX6psRV|2{hdnho{%G7`XV7vREy*3yN(8mw|_j(Woe1_P3qP6
z;nhMa_#}ESkzFNH;{9K@?+Pl*4>B+B`8<MfwY7Dzh$aL}&_4Zen~6)2I_R_TS!|Q$
z#s=#^(6>ZA@1LYMJ2hj#d+i|`v<p3V*UV4>_~JJ<Hrp4h&VG15)gYTXHKkd7%zRt^
zJ2L0YL91nnoxpgz+hWdhnfgH07|AD#rC(mY(ecV_IYE92pZI=I3%RE&<;2F5joV%z
zq%3#bTMx)QFIpu91%pkn{%ckNDCb2ea(tSJ(&a<qo@3>g#D)i5?u-jL^U3eqlA=ul
z53kkH3~b2p@rfS=)a>m`d8}`cvPhqS%c0+&ygT8gGqT-yY$~(NV@0BG#u6cTf7ju%
z$DyCk0Ba}CNoF+%ZL*|o6$|#VuMQ`;uP&<XF2B5AJl}fuulj>8m}sLxi+P&w$v;rn
zV5|q*3^#|Z^gulPFQ1Ar6a!x<5eC$}c=dSz6`!CZ70ENZG(c8a88tpm9Y{||<GkFk
z8>@aOpSI1cQQq@5DI{b+RUSFubn+K>DgOBvF$UrF-a{!o-HYeSC!GE3-j0SzKt;;8
zQVwB=Cy3|)j&9`xX&-5cvj?fwD2yRI`}^ZVX%!WP1n~;?hq(}I!>9Z!qfQU++}T_i
zk=mYe>c~7Qw<8<1t&yF!b#RDJqRQSsdF)llKOZ|59-4DhKj2;v%RonDvG#N=Whr8q
zNzu&s&_sMk*E%}9@;UNqNT!BiZ#HuY3$0~^hCdBGRiXZ{)$zI=t@@V<*ENV{<?+w=
zD<<9FZnR2M3PrZI8#ND6j6<LB*DXOu9XAj%0u;(=3OegKma|1RD1HOI(K|L<0>=k!
zsLFkZmEpFZekiS-5{)CH`z$Ob<JE3b;g3v4%l9O1R4W7Ia~`;fF{S`50ovVYUO=UR
z!|K{LlQC?*eg{6KKsG+h*1@@0(ZOOU?_@ghfXdl=rdl4`PgAzQ>E$_Ip(Z-~qg`}Y
zK&IH5IVcN#VyL{y2D5~`L{ygX2=gB9QB;gq93Kv6yvz`emdU2a1177)_zBFb<uv4Q
zF)oh<69#OS9*<@WL9lUb^7Sl8P;kuaRbbI5@4xdQB;>|`S=>i0gl~a<#{`1P`Cu;2
zPb0P(Op3`i=t^|hS=mFQ-90>HqIkm&oP=j{OZmZT0D{kn`jtZkb`k^F!Pm&RRuH22
zcF27BuPEV2Q++=h33#ZsqRMWhtkl|mWPX0S8y;6%G@P1`Z-1<+q*;j)BE1=)?xZVw
zbKsNbz}({E-kn*r9}<)#k|H7^Nz##_9A!4kxi72Jwjspy(h=->zC}gQ8h8Az&d%0I
zh-e`pC9hJNLWZI_Pue0je3BekhfcI%P`aOS1BUS@IbRZ9nef4)9O)}J{qNs%7E^pC
z_BxMLe(5<OT&|Q62DjW>j~pz(3H15&soe}7d6QzKBT9{Jq(@IzH$!bNGLnAcdS?%6
zxMX8xq%6bPKRzL0q}(ph_#6&SxM;i5K1FVlN`YZzW#x>wwx4ijf4`E9j7)&L@m)H)
z(z3D}b4H=n5bK3}$32V0C%Q7vp4E<&K{Yqg<>yZA;~c6D=k?^b(fYrJiakl8S4|+f
zbpBz^r#-RCfX-aY5p5#iwvi4DOAGdNBkGNc;PrX&V^;3kz^b)k`EZDyh-p;#tMsle
z!^l>I)QbPioI#ww3}`tFIQtymgx(#=IP$goAiXIB<#>EJ(UZ1gjO5QV>eID}nA0gm
z=j2?vdbNVb+A!Cs&lu?JD5ye)3a9x*t+)5(nMpk`rp08C1ISm@4Hxi9tv2Ug(vi1K
zH-+qPFFT+{D?WX?xE|7=Q}}3Wp=W=k48@GAO-w;iVZUV<@P&`s>sE*5d}qSjn}7tL
zWd8fh5$DTl8)?sa?&Di<?*Wzd1@y9;s(WGjNV?&9%WnYDm5)>?SLk{1b>**RxT&H!
zeAgA~XO_9yjU)*R(K*EHBV5|(#;iEVs6#{{%CH%95e@`0DaOf*)w)rrTa67*dPnTo
z=i551ppS*27x7ke)Jabc#sNXwT|w3Y{Ri6!D&)~z-1x7B=;(;f(5RT5>p0&M-d89P
z5()Eo@NjFf4_ROol9h!jx7!%Coy7k+pN)xIDhc92q>qg2CFei;_|PglIyy>6^GjLe
z=-^vEz#$;7-=zp(rlF}p)BDTtlL%SgbmM@o46jWh;I$RD)PD&GAX>i|gFcz3;}qpI
z-2Mh6)5}{Mn*mK9&+<!=$yr*2Rk11GEBKJOk&nK#@u-yo3F$}BKxE>&$m*ffbI86{
z%Au}4^_bp;@?mXlwzj*PR+Xj?EjU<2X+u6WYq>DjH@DCr9m7(~p~s`8@7@aJybvSv
ziL&riNuuB1$mh|7at{Q?aQLj`+!xI&q>^cs72(Q~k5jv~Asz4l=#-2huzYRNIB}hQ
zu8gqES9^90Vi^Dnao2to`X)pas6FEis{HVYO$fw>obb{b7tZjTYDzfp2S}cC=%|A5
zmK_qXYwLpDqF!dBv%R|;YHVO{Upj!|Fd4L(s{f)bBPn^AJ!By%8iwAT^i#}KyG21^
z!rHq8?YB(F4ZkW9V7<`Ya0lzfgJ)K{ietj4x~8a$uq3g6>?JPH8vvXS9hC}op|#yF
zG~2?o?4!!a1kBpz_lVKyp!IIqlWFCl=aV1wnt@()`27~!GqKou*JX2()X;b3O15ia
zkQo9>Rmd{)pxnt>Q}4_8pBw^S()P2}_H7JT?5Y*m%*;hiA5<{67QaIdn9Q2l*VFRS
z$k1JFqd+=zy0o}{>kF26hB~T9fIXXQ*`;quRk*Uc7qUXX&S<zDs!>UKEIo3kZl%&n
z4;YDL+|p$xlR-DzsO{#|koCU7;M^oDJn-xKPVEsiu&Xetg4^j8w3CA{>{&`L?%qN0
z%Qv$tw6*-w4n?9Hnby&)#TPK)OB$YM*b^n_${&6|M$-+qYHE^DfSJy@EmD>A$hbSa
zg}CCKqVq&8)l9SU&eJc@BZth{T?$<pL&H?=R9|j<E!FZ0kbNIA-=ShR@TKG<B_}@^
z(8HX9LEcMEPltiE2JC+UhiztuSziiBsr4eb>Xc<rDpkaP01KGw_ax>A>V0QRVpres
zu#-jA$I5Hb0v5gB3aAJMY*u<(L>p)Zd!3D{X6z>S&)G849p}{`g&xGnO<v9Glceg2
zd-{F{_^}r@(YN9p2%>V<BTZX!UwuPDqi%Z!8Mb_%XM*2suXJkpoYkf*kgeRJfgjhO
z1R*9Cu%2FVBi^^W7avGsoJl^#RifAg6JQJHOq|V>fC9Y({G{lpGLs#a21Xgc4L=Dm
zE{~YDs6JC@zoox^$V}|DyQkR&xVxcXn*Getp@W9i2hQmzg{(#+pI95sa&`yAnz|gK
zdDQM{5|_8>(EG2vCSpmT2PHnrx<+Vpan!M+YNLo)?udMcRE`QjHK_+}ZRq(VT7iS?
z$22tIAyk@nYZF;Z5!_ou>P}SAh2=MtA2ltgfw_J*<m7m8IXLVPtSM|~1lrr{ZVWI^
zU|b-b)LWPJ)unAnuZE`{F3lk~_ZRNkHYVUoP{uS2Vtf{0Fd{bYc^J-DQ-1ZrwDjm%
z(4kJ$nMdC|l)V-=E~mKc$xJ3ziI>GEc?3(IH+OOv54PdF*LkdmavF<GT|Yj)dgYwk
zhg%#s>b$>*v)7TQU=<ZqKDcY5p!?d%a4(q5xp%k?DnNGA)bG%-k`{Ms=un%r-gIgG
z0o1KUCNtw^F*3;SL(r3juLdsOiBF%aiM$XzIwl~8sH2)GR({-vt~4+1%~egdhFPMx
zEm4#Gd~JyR^0u)jdmZl8`7te_&g$!a16@x8-rnGk+G))UInr6I^mBA7cV4USjMMwZ
z1y|f_Z40EgcM#Z^Y0dxWFlqbT@#1kw5xTJ;N|Wya2dC4UAqOT@lyj5Rx8`LXROKcc
zTjk-=dYP$%c0>8+xGm?yxxCnDzK*Uw_<|3GYLsTqPFEl>c5QZlXUyWeo)t;-ZmxUa
z_-J?I=tC6(6-Kq^+8#Y~ytA^^CjD)0qpCyB8FElpn2&JIO|WXg?%*HQ*qf$8?e<ME
zsq-<K1sx%K)~2A-N6XF!j*drL<BNIEb}4p5f)5~ca>RQ~w8kEN{>^eu5<^dea8(WJ
z%+>Vu8Ps3bX|m8J)9o@8hshogsE^kX_QWv4Ue!p{L|8bFLBF10sjm0PFMxl)0^V-%
z18i*9He*I9ANZ`thAxW#KzJaD-g&1>G#b6!>MR3WjbnrJAnoT1k}k6>D+V~{Z4l1;
zIonP&C;3RJI+soiZ8HhjEskhrlsdZ@R)_2K92c<8?`5F)citZzL1vfIAHq6Ud&1lJ
zEIt^?unnt+Zrg^^eL_ymj2sIcbvW#;=}4JS9Ir>yLSRt`Df87hqS6(62Q)I3YqN@q
z2&CP{rW8sZZSQbA^qGoM0J;_wc|D$BPcb*CsI94q37AiyRE}2bPbFlBs~yZL2Ko^D
zk`hCB=a1Eo8cEl)pfk?PWsKU5UQJq5Z6ilX1!>}hPP&zyTZ{3>^2gST@IwPs8QgKO
z51}Vjl^L|dET-n@v}1)?=bs0SHq`sCke!9?k_}U@5xx|Hf`Y0=rYtNh3G=qh8%#zV
z-zFv|ii;oH^rE*GwcfsERV#kdn9`NA0NvelVwiH7N-Hfb-Q3(9$kPi9Kpr3M`1trJ
zM0V^BQ_0xPiG$%bSYrl1P-5VRPXIv<gLVYpS?3+(c>}-JD4zVOdy>SV*UDBKj!|r7
zc~K9Xx`MY_akymgS5_8{4icA9Y{wtVI!m?*9drrUE^yl<?j0ZQe}fWGnmp2Mg*gsP
z+6^7&Z#PhVJa;r9<3Qtl<y+LdHu|mE(f*Bba2kZ$C&tkm$47n6&m_ClDtiwL_SQ$2
zz8>%QIUhYC20aQT0{b;cap$#scy!x0<g)$2D8xBEdOw8FkAlbAW>JP}`ncx61R-zo
z`h6etSV?nvWlzBQz(Eg++VuZ{9nED5n~4eU4g|;WSp0Z7WfO^laiD!ggPVt(cD>+z
zQM;3a^25`udZWyHBY1uz4_Q7&5x^%$f1o}US=W@|jLeC@(B~k{%gyb#PdhkpG*l9^
zd-dw;YiFxq4iCpN-Ii~qr9Ae%iBjQniHf8??k0H&Xx0Q@om7gR><PQ+AVpkU+&R%m
z=^~vbk`EV1+x90J3DxH@*_;=c+A%-81bw_@T_Zz%-MtSSR6g~6yfpQ?@vf#b(o;$e
z!bp8gsxdaGI+%5E`)+a(TtF?2j#e|AjjPD|qk)S(8f_XyYO&Ti?a4nXcof)Lpb>jx
zZ$Z^^RomJ6M|_j|NCrjZY>7ljrp=f2s&QwHkmfMEKzrPjE}H}+`**K3kDu5kqj~$J
z7i2cKOZ#Cu+ui;js;aV7nz_!)Tw2{YES$Wzcf;bF{pV#(^J$f~lo_)EIrsRsS6kW^
z${RhI$}v@Xs?A+OXX&Ljso}BCuV2r3Y;Fb4+%ZMNx-h%<0!*}=1v1L_*mqK|=8IC5
zK1XhOQ(PHyNp*7=yGh?Ik9}8<%*z<EAqsTHSb>$m)K%W!Kw!+WvhH$LYKSgjH|vyK
z3n`Q1!UC@a3@t0BwcWXbO{mWJw?)Kyr!@v01~%#kt1HwOC`)BA8N?Q3kWNq=#Xjyw
z4Z~3J!WT-MiLu6bD)hY7qswvNlbNrbf^Ai8cd`lo@f)3}%^j(7yAR0Wo4<a^NlH^=
zzudxQr#6Og6=I5W;QKGX*^avGOy@@?Lypz=#^v*286kYq=J5dPotneTjwoy#!-fyi
zTpQCNawM$f@{OKx;S1rCQt>4+0({TEbs6?BG73mb`-TjY9LZ<{(jo!$mSU}Pj6vf;
zCVXa|jmeA5;PM`Uhphp$ZM{8_?TfQtv@EU=5xr8}k$zTt`4avBUk?}QVhD=_dpJbQ
zQYE7_aR$#KBI@WF?8x50#OG+lb+R}%Ha0Xgw79tVrDm<p2jt5WZ?p_HOq6F8^5hz*
z-+4h1UsMwY^Ozi@p`mJO)0A=5$HhZRNHh%;vOqYu%ulW(BW9ZuJ29D?%N4@rmymJ1
zq*s!Vun*E=4n<u}^g_tjuh(wez`F<P$3P1*#pKLPjZ!NEQ&Ur2-Na1-33@g*)Q=C?
z)6>(F=4)5~eivVJ(2;4&Av=}8qxAml94egrSPo^7H*H7-<o5bd=cH0}e`kV`g=L?P
zE<aNY^?o8+M9b+_x15VUMPSPFB-JlpuW8B#(8Oer9Y{+_`3x98x|x}DT(qq4vj^4g
zl{OW4brm9hLKpof0zT~h&^<KLwVPu2?lnUT4%UyLTjZgS9jZQJ6{sx(YdZO@Lurwp
zGcGRb^73YgI^pN~^_407_yoC^d&1afI|>VZ3k$J{2200BXsD@UVq>e<-9Anh2hz&~
zxMpW(fBLj|+?2NoiPv5kE>Xz{juaC=rxzwkSD+%b;O{y!GxOue4>!CUDmH%G&(6cy
zSxriM@aQqL!=I@V`x4d|Ibe8w)nFb!`)*#|2@ZkNCRA3)9hR3bD=VuqPhVBf*V*Uk
zJd}R^{JDsT`+#vGRbE~mm+5eF?kXgHaG@tHEj3l2hcd3C*kZcz=TDJHY0;z3yg9}T
zu5Z>R>bSYNUGUZ!nVB6AxB419S0R(_!KxMSs!t-8q(lN{6IMMri}e_E5qr>hA!EY)
z<b@c6X=xP|e0ovYQ;^Hw#s;Wi9;CM_G&D6dGJ6d1@bCbAk=!-DPvJXxhlU1dl=A_s
zfep{3JM=j`2&w4wXzA%gf`cDDdL;7fS$lZc*RRhG2~u<)g=bWr^*m3^4X*UZA-K;H
zbOIRuT!oE8f<Q=6(9m5|-^^U+RGGkiZ9Rb^bFB+ZQ^niu?KmQrNdxAfK#&GuxUsP@
zm`zh!f`b%)Z>YET=Yh1Oq@=hwE~A0I{&%mu_4W1K++3TL;joB^X4a+yU+E->6FOhX
z>k-eSB!Da@<YU*5{8uOE<1&~UU<q8}kEOREJv|6YyQv%ZAk)5$M=AUxX<p<U980~1
zS$f{Ff_yY9{j?pZ_JH1Ag<yG)o){SY-Mh8Mj{@OAK|#!c1Q~Mj^4Cd7f+Hi<Ra6>D
z-h2r+xPMb=Mi=mvkq<CKuU9$#@zm^xt(8t5M8wXSq~c5)5h{FcBtL7JZ~7fHrpBam
z?JH)r_Ri~8u2%cEa61&1e19FHA&^ymz@qDDTu9r1YHueKq$spv;^*h5r$=-S`n&r3
zUs+!sEHE;VrM!ON8{|F){dw+W*Zt#ZTQ?zAGE9QwkC*+V2Z~VbrPf(ok2Jnhb#+DC
zZuM9VS%}gOpp5&osp;uGGV&)LvvP2JWzC+LMP7YH?09WhayOEU%msOi7*kHkV0X!$
zB*6RrSXixRFh{yrpj6#kC$-Id`39?6zCd5wMVN=KI2{heX6Yg>?qI34k%0lD6hbRZ
zQ#q~!#1%wcT~1zamL?T3vE_L6d0FX5UwL><8%isVz4h5uLZ|%y9-x0bO4JxsWT+P(
zMQIhsBpB17-`v>vb}?gpW5eJT0i)mN&qPapnSF~!Mn>~RM!o4NX=!nKCWHBmoSgEU
z5m8|v&y(Wv4~B$>*0)Yz!`V&$$CXwg6D9*|D^=#?O}We!E*<*Xb|IorW#u12ihl&%
zzb^y^eYimh7sq0lPMFlB4+beG+3_Zrb3Y3b;XB~e7$z!r`1HXe^V+ZcnV4{r=w9DG
zqg;HjB2rR@l}=o2So2u@7Fb#E2R}dwAFQ?q{e2tX23$NV-hr>n4>a>y;p)m83SO&!
zoVNP22fj^b;c>UeptVEQWggbQ{X@+t?`SEkM{|5e*S@Ny9YXj*^QRD#-TKExv-WMU
z3-HM&5h!RtV4ihp7#Wvte^1*3=`B1w<WNP~x+2`3pOsbI;v^Fv`2LZ*LxVB!^pnen
zbJ}NMq8i%X(Jn~y+u+36*$w687XFHUI*I~_hdsz5)Qh(hZChx&f`XxC13VOE?|-w&
zpUe9maB}r{701MSY_Dl6^aSE?4yxCFI4q=l=G|y?!X7@Sho<l~^7D3VHePv`h<p(C
zZ4<3mFQ_wJD8tVk^$lG-RVZ={UZmQ~moGu^P*WRDW}C?UfnxakTY5smd`R$c{V`Rg
znyb0o9dOIS!VVF);6f^rAYW##Jd^JkqH<jANQ#d7+R5siswpcgtER@Be1)7`ve85>
zrnJ-^@Q5WPC1GJ<Hr7ga@)gMu1_guGi@hwGm5#hBSK;700F9Ud=wk-2Rqg?pa2xjm
zkw3T%MJ$-;{9}P*f-o^i3rbttn3bVFD7=i7eM5PES!D~t$;qjkH}Lv?I}b>@k23FP
z!t?;Y45B_yq-kpe_x0=7<KxrQo5Mciz(oImSpSXsxnS$zg2$!c+J?~5Vgik>iBBu)
zJ`oYoy?ghVHN_tLLhRP>a$m&5^ZHbL1m~523yIIPMyB-`4tT~tdl%fGEt}R8Pr$8j
zYD#eVvRZL_BhRx=VheE?jGy}vsSj9IkhKaYJ;}k2{M{x%Tz`*z$+NNm_5O)YPELCI
z)P<DP#;CxsY%pjADlRS#B7L(i`(bEgWQF-;I|Hy@$(jJBR$gA7gTk*vU#-MKM@uVC
zI*G=4UU~DHYgM-G+C)SIdDVFnfwLCD%eHqAV3+^%K!HGi9;5)@GIS(9*}A305d(u&
zIx<E?B&#nZB*ds-u)4ZBKc8d4Y8RlT;`;jfxjFsR*OGL^+`@4k<ou4I5kV0VYAPy&
z{rxwuUKKRcm6avhg2aCtFfKgAeb&;kON+i*@;~2baT*d6Z-ZI5n3YqLcl&c-^dOUd
z|NcIDUnH)>A8uk|(u}IDt&NY5_YZJ)zfk&^iH(hoiOFcTHL{_hVJWv6HD^`Yl_cX;
z!0>*uxC~Vp?raVs$!{Na%}-sJu#}(z@>VAcs{k2mi)vrJRC(ljSb-g)QYx^o=e)T0
zU>h9Cd2m~|B+7^m30MqFOms}${?GJwNkL^`8$yowf@^JUEiIjy?t**?W~}}t8@y+>
z1;a&fU&u(SNf2Df%vV4B{<^mD%#Hx0O5l^`<9_6jvc}`~0sdf9%vSVUhdh72MoY^b
zFcdHa)t~eEPM}(Por)?tFi^|Dq+U==?Bbpc$Zi~revOpynkfP=Yzw+SVc+S)L%0K}
z{33A>JJBGh35M#gK?147cfLK$x6sZ`i$fmxK<lj*?BgShK#h?w=?G`%(T-kF9V980
z;bvzhN4UAWcb-=`IDohqy7;+-GuEL(B@^T0@0LvYHU?6bVuCX$M^1&bHxUQEQ2k=0
z3r_i2PGZNy2E>Q$;8AvjJ*nY*k<f?xmr{t!>DS|Q5405bK8}bJ*u!~X8M=NlyS&E^
zlydk(fgdDTh|QGmR|)9rKfl&zm0`mspfQ8~mVz>FdaWpGCg-%%feUk<%c5zqrV^Or
z;_8*)%gCCSh`v>XS31(hX-&yci1rG-M7lJ;a3Feh9&0h?Yd;kjY|j9~$y%YvDuhGs
zA$Na{mgtKY?ZEjXAt5nu$k=qyNa~Xa=mu6+-^c>kAsQ4BcsE%6Ef%cO(G*qdOn~e*
zmKGN;UCb=gA5T}#i(s_posD4ve-rEdsXX?C<Yx_P_hVal{?z!TQR4H{eDtYh@W*0)
zVJ-7$Z76b&7zQ1lg_!Bvoo8o;d0nowbWg(NQ-4MD0Ik|;LwUf?S6Hem{=&mEv?`Mz
zC`(`&e7}Kj8H*}4IQ=O3{k~jz0lPb_Zk~tEkKDQooL%79vQ_0OcWC#?Jm)47_kfAX
zE2F5ftt~0a?W?G;@TG0hzQwJAf`ZK<PNUxLo}P|8paXsV_AQlEx~;izKo&pY)96XI
zf6EfUgvaB3*g?_vJ7AnU_6}b2*7Cx>g9Crol0oto0sanr_9dy4<#YwFU;!*MB`L-o
zeH(WB@&;q-n;Tt73!Ca|_?-M7({xk~E)uG;3-??uS&Da{jH8#ejGsuk(*u=AyWrqp
zFC|(ol$hrq5k)L7gwNUIrPCR<ZcBGvDBS>OH2(oQl7{UTD;K#Am6;uSS~=~gO$i<M
zn6yww!<1um%f7CkjgtqHymKc(?e};HqNJPwhT$rYPIss)ko-2A6YM|&V-+RK?J5_3
zKE6Sc(dA?;pqfyJYw)oZ?5^1l%aW0jcDur)K{Wzy1LP|8XDxRZQ<>!!&JFM6@8l2M
z^LoZT8Z&;TDN6pi((5`utkT>a7QvXM#XNqPsbDQTywqSIP;1^bmU}&b`bE+Ghb%Ig
z1Po26$HV~DocUc`dRL4br>6$<9pc=;X_+Tws!V@F&6_%|o?Oa5+<?@N&}kk|BPQt<
z%BtadHU(cavD?4cNtwsGCS-Qe_~p&ChQhA5{!JPK8)P%y7Y0=`SB_oBtYG5|q4&)@
zzlt8W{3OniLWC5E_R+Hrs*zaj6XiS=v~5DPvCJ9l>n3~F3+May!ck@{I8>1$i~d?F
zmOq7#!$q*w(+?Iy%rX<EA7z(=Pi~ZyhenEhTkVmA@AySK<;*K1{paI9I;NFASo-AH
zV_zJBB<_KdrWt5n#aAdjoK-bb*}iM!OD(Y*AavlX>hf<O4ZMxQjB`lTYS-qftJACU
zo8*GkTDpcjOhr3IQF>Gv)#>=lM4X%lxRQi9wNE4m?kAVM;oQ6WgqgWsq&Mp2@B@d#
z))J`m&}LQi%HgD*%et$dblin!oaqwvNQlO$bW;tHtbQ_H7uOA@V?+B{PT8lE@0GX(
zSVbC18PaYj;v@Q+m5z8TEcC6;W<0?yE6H-b6DNk)g}ZzWgy&NWwqeuU=r~$!${=N>
zk7I_rz97KhkbmNb`_eGIT3QL0@_HAv@t2S%k!oC=$mz8=FoZ}nTvJ%*st!%IaO#_^
zXF>RJ@M&WiA(ll#BuY|w4UhC|EzzO(wC{lJox+%o2TL?w8kn8K!SjJK@4T9$s8GR0
z>nF3Lhs$9CX1Tl}-KYo7LY%nAE^qmA?9F@LEy*m>JCqHvq76)A72_(_yN=%XCb`Uy
z3c%<-5xwRAq;Y?9<(xeaT<fiSQy7Dh6Ul)u)BE3HYp@GrdR?QF-0yQACEYWNu3BLw
z^#)mG1Ql`bI!vEX)uV2Qky+{?-ol%nUA!z1yzS1r1-NFqI#atXF$=C!TxZi+|0J-t
z-_il!S8v{Z{Aka&5eBCTws+5j-I<n*Ot$q(VvwBUFm5u=U@Urd7d#USBYbfS_ei+*
zO4;MZq=m9%a$JGxPu!$Ojm&3>dAj$N@H@xLC`;a%!E4Ny0nj8MPo~9D$>5=#Ml>|6
z1)LyuwSySH43;8$(5dvdWczp+b$_;Vzh^CK@|d0acD-2@_F3hPLygHAytrhd2`{2x
zHRZY?UZV*XycFLz<BuFCyAB5ga3B!J*<-Qq>izfbRSFOtXm|QlMM+l`Iqs&oh^eL?
zX^_ll*(p2RibGzN$aGv8L2B?jZTf~3<rTbqbdUYkT?RcnyZHEbg)coCBcF^^e;?Xp
zj`g5Qs$mWSAK);^wKum5WJ;S_g)}q_@QgJ3kg+c<%rYdR$@`-GSKq3ST&jRYDWuU+
zHJB?8gg1pC#Et$H@_Pf#+r^@gkA#?&dDeG&MTJ?*w{b}H9iw0(Wg4R`h}2m<D+xx4
zwE;|N?s1>XyStQjcF%|gXTPT)i@ZglwFY@4+oqO&VANF^+dg;B&`v)l6nA{K@#?$P
z4ucpdy^=NSS>tC!)*R$<Mp;T%t|0qXiSK*q?2Xe8f9cO;H_t^jGz~=s71X^hE&^4!
zXOd%||A^e7w|C3=^JO+G&2Jt+M6?d8-HOCbjW$t~<OqPl>j0FitMftM!IWnI1j1G0
zFj3BQTe^{rP{q~!MF%Zw>&=3QI0Gx@KBW{}@lO{<9HZ4rrj7mPlaa$phHvVv?b{qh
zh^aRKvYVzLm6jX7ylsWAR%#0{gB<tUh9HBoBXb$)P@<>bNbXt`;?5t`-YV++5&H{}
z{JdF0Q@rt3ruG7dS?f*++O+8&u9R3aTMYjvr+A5E<5~JdW&Z^2B5=?ORY?|!)Gg{X
zJ?gT)mzqc3P>t59dD8LXwtX{+BloS-<&_Jjys|324!|2Xx4=-rVs!kojHvZmVnY1_
z!v+)l3Wu-%gF93gkh^1NX9uhYV~?eix-!(udK*YC{Wj!K!t}he#cp{#LlpCCsYP>+
zuI5cW_P?6P$%IG09~m3V*XoI6WE_CoTR$>Lj-Ajd<@X|=vtlQtI?B3w-A^7aCnu-%
zC>;L+c7~jWhUp~k<O1|py|Zawp^%nPVQ6gZ{psE4vB~&9Nut|$z`3tv6ZK$4BH^0a
z0BW^}9m+fz?=w}sw+F3%Hh2l<!Fmqw(k1)jLu758aEkj4Jz-(Kp#LznBdaTd8K0-|
zE+Ne6Wuorlym|9RGFJ&rA^Q-zqL!i1Bz>3GB~OwrP$h%Tx?a%I((?7|v1&Iwma?KZ
zHSQ2pIXgGE-LBq@j*bqffVkN|{<fwYbAfp*Ls%VTB3zBo6s2PKn5lJmrwAm5m?sIX
zm{*pe=LWtK6KSL;@7mUSiS2;GkM`EBTe^8+u;$j*(PDG`YoLWAr4U$Pbl-T3Og08F
zk&%&E^HA(bw8bXfyzE}*bz^pF>LM$Hh=qj(;BO|N<o|-43}FbSCw$!I3?YsEI~0nz
zne)*1>Y&76WOig@VM$6|Fd;1$GXK8{uhRG??T26FI7nN@Ydjj19R-C>)mtPE=kK&E
zk5+_R8Gy=&BNzdQ-``a4YAq;RFxb)beHohKn<h6GR9$x_FaM)1)YlK56n|G#MU`eZ
z(%akH#l;1XhQJ=wMdYKaUsG0C_-F?D_U&6x&Wk-462exG5LSHkfKiC;6rboQrmCCA
zefRENKE4qfZ@-#<<0$}QB^%2G2mNajodEr;-~7A`@7AlQqKF>N<I?QRm5;Y$5xU?$
zC{#KJ2OD!2vU$2uA3uJ4ab8YYx#8h_I}dp0V8aXy3?w8Z7+)nR3A^C>LC3Uk8TzUw
z<_Ykb0Ya42{8ot8|Ga^p;07Yx^fqSR=9RdTFIuq|8QXbA`|8CP-rb{st&n{{b}VkK
zt${Pl9o*|q4sO8TY_BXF3?NO9nm+d%-t;zJ_NC09b5Zqmjd0lB%%Ex%Y{0~ZT#*Fq
zGzL1kKovfnT9XczLDH)KK_oU`J@jqRPEAR9GutUL{%;%V{Vv6Tx!3^)_&Ch8g-6Uo
z3HW(@j?Ft}a=#CSylAP#b>i$o0XURz@p>w@OoIxy_#M0@@0G=FWXJ!D<{J4+bJdUI
ziS&I{c6_wofPf;MiZU~$U+5j6{E`5(2I%v@FuE9KZHd9$DL6EaduPQ=)u=HA@#g-#
z(4uNa9ANxvhk%mcYHCHxz(MMxtgLKYArGh$pLFVJK$W;0c5K-@nV7H~+O&M*1oiZ0
zI&fhoBqpZ1%hBbT6kY@_Bne4LZM_F#d%(*=N=9}*34-7Dr|5gS_5~{iQgzH1$kdOA
zwzHG7B%Aq8LVjk)d%fYod<K&|F70GPThHU9s17`T&aCde?P_P0fR>j>YW8QdY*@tQ
zgzxiAglT>;3h?8bF*|4dtK70FcX&sR$YqwuI;k~u9M~i(OW%#VEO#N6j{vuc{g0Y(
z463c?4V(3nY8zc!w~@$sD_`(QkoKy&mavxPT#k=OhQ<N7w`SWpXFLw#lqIr<hZb=U
zPO1k~y@Wf+&Rk8z=ioG>B%^%kKwA7Alk<hbBfTw_fc7DaUl6%n!&p+u^H}7#Lzq{P
zGiSxdHyvXFDvPxye9-|>I@jIjc1r6M7HgmMU_B#)&q8#*@X%Z!W+d<c%Gfy+<=fxz
zvrm9`S{nfb5T0uvINmJg8Pl_q-0cDjmN#N@!(<kbv!FkR%hpsVHZ2_9RDT8^&jLQx
zcyE`&{4c8IDVhKf;{rC<`Q~^wGu8D@h=5-O@P-o4eH*xzgYb1JXRVsP)r!j%uautR
z3S6r<wCoII%G3Yu7p;2b+AYplUfDZJ(X{`*Cadq@lh?mTMLhoc%63eAsD}_YK4OA`
zl;5DCMnjHcQ>$&hbR|wJ_qx^&M6%uAb)}b1&@dc)qCu0e<Ge!Qjxv?7h0Y*PO1phA
zF96Y85*Sh^r{`Z^0p@aKCADn#qR2Nn`}}UgC!q+ZUqr?y&Hk;1mC6nJpPe|Ex1`~o
zP(D73c;oiy`;0JWxk>PRWFK>g{iS<pOOlzAaUb+a^oG{OV(RChscC){+bSJeek>AF
zPHInuY-F~i5)%Q&`lY58rD>UR_dr2N|3mIOwfEi3*CC)%h2c@I;&w}}ZqZZW4653<
zkuMq~Cm`+5Cu0-I(%Z^RiMC{4m#NUCfGgx7=pCbu`TSDcTbh0(kla%M5E7?2(Wt-y
zhi6Dxb;=OtHBljyD7W#jzo%p972Btj7`QJ-pVPs20LIw1@+%YBoOFj-=WGP}y6wIz
zEs09O7$V)5{pmudZ|yHzd_taCzZ3E&`K&$yxq6lTE{FEcD*@{9y3e2eEKl@KBSqv-
z{+`*#naRlvbaZsAtft*J%z^j%A41h_yvcE>WA&_*%KGX#l&8yj6midpCV4!fd8U2$
z1{|`o5y{2jrsZB|_N5Biy80?6$xHms1%c@n|9hp!%7T)BP?HmORj0V8={%TIIb%O~
zhW{0d=OL?wS28bYd8bGmEIw-?_YBBZw?mN1xl;qBrNlZqW|RV>40Z5+NKaNKf1%Yp
zsl|N}xZGc8?m~?UtNG61;o;_HNe7g-`aANS%`O?yXV0c)XZ?(XxJH4|TIZMg{a*>J
z-lgV{z^mof5^8y`JU1z(7s4@o93^V7*%Gv~ey#^yk{IPqy5PE7dUx#Tn%VM}?Z?U)
z^igmgaiV_m8nqhBkhqYok|e%7`&4e=ik){<y@K=u`FK(V-oqKQKWP>@Dfa%Z9?Mw6
z<d{Tzn702I^b34*=y?Yd-6*dnt%-c!ryr=|Gr1TDfC2G9fNqJ-l8`(o`P#J{xj^Q#
z<>rR!PyiZ@p`l^3ZeG>v@79e?O(tqp1~9#b09p{L0VHb*VWaAEH!c4?@9VKoF~h&>
z>7kr2y_0ij97Mttf0YNkg-D!2_KJto<2Qgzb$m_NQlA>Hc|0pES2Bh89kOe*OE-<#
z${VV0`y)4oF|Fdw<7U1!ng9`e43=;N>!0hqwP5TzKDxeMG)I)WwhBo_J+&n-;Ai@+
zTKx(2jpV;*b5WyUj5)9&ULox{g+=@DF!{}!#qY+Sk{u3%Zc}M#X&amENa!f=v$kZi
z0|U)(U={faboaU8h5dc~@pGEar9VHj87W-^UpzV*%_M|w10tjE%`MB_88^@C_7s)W
zcaGJ~nua_+Q|nqPg(!6sy0PGDaSqt1T(%fsa)8n7`p799xEw{aHZC9V`w{F?aQ$e2
zA@tKV3z>a>39vol|3s^CjnyddIYY&)%+%D_dE5U5SoJSe(4|C2M`!gdBAt(&lBUtW
zMM|K4?k^;AgxAP)(e&#?)Qw#vqGXp7s>9vazpiVdAHpn!-zCI++2a<~&nFH|J%`R?
zA@^~#<qy3NeXCFmS5Imp`2I>X$0Lp8RC@@J!M4<KwcXGZhT5C<PIh)B3TZ6N%#sok
zanc?rOamXloxZ+_5~csY^aV7oHV0`4i)tcir#8N@urL4-l9b$Mpj(imjg`o?xR~t5
z@9&@Izc@Vn5G}SB|HN5fJ>=j9B~Lz)je`EW_Ek~(O}l6RP#AC0!`UZ3(+g^73py3n
zOZczumZt2n`VnqG9-*6i-zk41{lFrN??77`;MIY&2)@LKYbg_agS?3%Z|&Lp#Ystc
zo*6Qxwl-Qdn5i3xRNB|BU0bz8>T?Li<z{5~>_q}n85DoAo$c*UZs-A_3LKZ^;Sx~J
zgsBJbQAA2Z(VGT2B#b~u$XZxfNbZWc@ez5;K^5~zM2G&^S+^)jdDCGB$!jqW^}%D<
zCfo0tt=b-AzOMdP5*qBtxxs|0OkXAk<XgAEN$1q{?ORJLu~m;HSK1F-Q)I_IwzI93
z4!pVlmD6cbtCyo}i|-J!In*ZEn)E+O=S4rlU<hr)#`^wdwv?2;YU7m`$;cSmjEoHC
zG=emtLuoOwPXoqu&P+^9%5)SeNvWy#n3<U&kh1*zuPoJfxVpeZr{<QoFc0^G>^e=*
zsc*=1+i+kiSw~vmD~2GH!?{<g`>44)l_C>3k}>wFQ}QpMsNS?D?4(WdA&&K)KJf<%
z!?|1$|LD+r)Ub<$AU#i*L~*VKyS+kWzay($n?G8Lc8#Q{+;Dz|?cCp^WTA7K&v|>%
zg4gfxDkk1p0MY;DCYl-cuNq5plh_&%cYqY4vJGTsYVe6wp0D^BaiAT{n2KNjcts0L
zu7F3p%iJQ`bNE1Ju{rDE*H%y%*rn|DvZlL@Vw~$Gir8V_*n#cu&WHNy-DN*k2E8Gz
z2jkTiD@rSwbq{Bf%bm`J5NR*ud~?yuhVQ5>Ym{=Ag8T1z_v%ZCJ?SE)z*zl-I4?A4
zhXcu}3<+LDY{v5YkM3%%sP^TwesLy4jk&W-up<gV&MMKWmajMApnZdO`wK^3fB!}J
z<o81I?dpM1JD<-{8yU-+G=YM%s%())CQ1DD8kD7rk4Trm&9=~ppnqA>PU)mT1c(DU
z1!S0JCfmWSikwm&v`T9S2T!n3`YT@AXRfA|GMU2`g$S!|GgIb*`%}P?#-BIrH!NTi
z1l4~$(r%cYWHi=U2QWogBOuTuhqnA2EULd=>CC_VCGV!Og!MeXJ~)=cSayfLR=;$p
zZ;WB&ptGHZfkA(<Hv`msK=Raq>G+7qsi~0zuSBZ~9Ofh`*$fOuV%*j|3Wo#&a6kEM
z{-zic?kzV<rA@Vkxuv6nVXoAU)acdrvAsOgT+*3;6fD)Yz?f1EG?1s(!`DI31w3zg
z2H7Z|elOYon~g$Tw1VxygD;V^7q%e4-m1k2`+0n1+b9%rwGtgOI@=zi`J9)Fm*esF
zcDM%-`XJ8Xi(jwdiJYYB@~1802?^*b&0gYFk6ubNN*@Li-N}vpG9PLw2PY`rC-o^z
zX#Nxube`9$<T0=k@0NI3rSu_^Dg$oLvyU*{?u5MaRcyawhTp}9;Yf?low5xSe@Dj5
z2dl@>V|*(3?~zx{Fmz*;5R{R?$u8J!j~rNncUNWE$X<+C?yL{JHG|t7rOz+Ce{OMn
z77Od$%~N>3s^`F0@x8spgvxa@Y~~`+pNJ1W3u-R$3;%m<RDF?Kj_j*^?3^?6q^6_t
z(NNI*+e}EcS!}83N+mYdkI?@*S#INH&OmvA$c=f{%YV2*=H~bZ5l-y=)}_Z5HBa_G
zmCnYLnZ{-6T^q{5#(KYW3itxIzyN5i#m?rJC}$~mP{IMGN+pUB<Yg<$WwS+3!lT`8
z#X`$RHCtF%*Kkh77hu~4%l~VA|M)erTO(BD+}4>}ouWyV=((n>K}eQokMmsSkgnSW
zdgjvWnZT*gc>3D?=M=GJo_YR1*djq`hYj?hC5jPn8kZp!;-42wXbuvmp(E~&e3<dB
zr{3Fbn5fMa_~Zp}8D`~m;}hPPlA8{Pnp)7>8ujd1vwu6_Vx8i0U<0`vfZ*Ur>Pa;+
zth|i?WXsfSR!A?Viosm%=flz=NS4#PsqP!IY?SDArX0SK?lVBwT`UY8GL`mKW!N!0
zkM*AV6i~%f_L#5<JX7R-l(+VjfccX2nVizDhKfpSp~*l>il2mj{Ynb#X|>n&>r^2@
z6(x>#%qE9=0!=I|BXCPS-(`lV9GNX{U#5HaK6`q4dU=s;39FF+_4)qKj+R+hnk0UK
z9jUzz`v-ll+L$NDI`!nh=z<}TZfhMyE-YvP3vfe=dPvw_`bYPra126y`BLUmthHn!
zU8Je7tgPen=e{2c-ADO=hvKuB*ZJ!%uCC8ZTA~&HMCi>ghi?rjEfX0SjX$NYdx3#>
zIB1FsKA`)dTamU$B!`D^#Ck}`%aq2s=S<*KB@Px=+u7gTPaK#N)UWm;J1uo#xwx1C
zcu2W=lBxuQV|XlQcn5oXJ9=%NJXPh|jr{hFG;hH8{!RVJ^>yGJy{QNf1YV1vAX)}q
zo28ovzBGdtX*XH{f_2n?;pgL249?u%=#Z&MA|=-QQ#L(}zbO}V<6WqqhUz=|RJ}8^
z+f$WY7=rY){bJ>1DLaH!3$wJWQ=|R;U%Y(zX76dW`EWxZeS@HqlG3x<t>bCs|6<_Z
zFs|R_bwECsmk(0&f`js_+IF9UpE(&}_;zM5FA*^cvy@#r`-u|kNAc-s1xNzS$@i~A
zwKpSDW3Dv?0QEs+5en#r3sS2~446_W;LDdckHY5>OG`^WWF8{mvwA1MRafMvK1Zfn
zNp4Qg%cL$)m@wOau6nIhZfg!~!(%R7NBPunIRyohYu7eAM5(w1nh1e}0NhM($kFRd
zYnC^f{GxJ6yW<E#zmgYS*-yvD`g!S8<^m2Ms<O|NRuuNo+Qz??#hLkAk&uqVx~d#2
zEKJy{M82^|%C!iuZRwETmf13}h+bNnePr7F+6`=YQuL2cf<nL=!uim?qO!bPEv<)y
zgd{N`;bl_I4gd+lpyO3k6nvzKF3>X~wnLwQieIu-Tw0nH^b=zk^-tfq*cll;!P2)F
zKT@qctDM>P*2(|q+D~w<WWl<v?v2%_5c)>teoMiL#lgByc{;iPD~HCJg$7R}`I^L;
z1)TGRO#7#;21$#hehS{D7#H{Zz(Z;4tsGIxJ-0^bk~oZsbZ6lX4nBTafeI%}{D1tL
zzcA|8pdjGCiU<ZB0qaG72dam71?&*$ilnr(Zio&{G^s0+-$^B1xUte{AGjxW292`#
zcE?<&CNtH`I>EoLu8ob21Y}zd#)PB21G!8V>)}u83~e^y60qtsk**ZL!rD0<%P^)G
zAGL`aoL?UoW|fTEVA2TBZR()LNLZ#p_I|43i{!qqF`jd;ol&3^78wDySuTJ=^^lc9
zo}L1V_$&~6>5QnScv^2syhUL;J?*K<hiQs>fYQ~peN#UZ!8Zhsr9@V=qQ(&kJO860
zK){~3R!JV$EWCgd6A`_A6yAVrY!nZE?$Mi)nu?2u*VNhyUYCwnc6|%S&CAS|d7;Q0
zZBXEX4ijKV>NIOr0(LFj%t`3Y!miEupGY6fIZ!&Cz$U?Wr4~c~%eOpKtoHkcQK^U@
zUGr=QqNEY01K=+9knGnUxA&<Dyi3smIwajL4*vEI!8@rw!yN*v!o0%zo&%CP2M<a5
z>-7i#=0_9l1)ty_TE~~HOyV5Q{IpL=+TwJYXP>kYEZQ$gY{jlqaQ=e4<BB4$SP&te
z@(%~*#0!tRQ0;sf=iDaq1$cD8qAm7TjP~1pfJw?YPxL5unKV%ZffiZ!(TfY&395f%
zy!$fMn{sEp2T@{K^H)F+aKQtyal|2Qvt17)sY{iy5=9!e_j!0Zlt!h`{xzH4B!ttH
z`p!S&l#s76mhPi3+i#6tEt}*=+6i^B>nerH+*+EC4fq7gsKSl0==rr1R>sh#cHzD}
zM}JSu`c@LfqETiN#h6e}Ko1osqrF?vBX4rGw2>jC{pbfSd`y%vceX0K+4L%Ua*$1}
zSl>A_`szU|7%&6$7=VF&C!Y!l&7X$B6HxGOfjv?1C{s1Q16axe0s_`xxarYxFcO<b
z=iuygK+3ugkk2*%XUf<4a*B`<*>i<c#cmkH10}}#e)&`${5Qe|UlmcCx@9TFhtJuK
zvsPnwc132bR|83uoK;c@pI-XNNNVVKQL(A8^GZ|$VM3R%?5h_4Ch5}I2ljV3Mejx3
zlf`F>f^BfQR`69M(u`7f%}=CPMP>eBkJ+GcHLhWWw2xX?A5)1qAuv!`W)%`=Vss8J
z#7-ONN&&GscmBM|NNEA6xDqFGnzPb7^g-$a<Fope7yGjOJUvU^jh|&ql18K=$7gq9
zKm-7S<Xw9DwkW=E$Ki52kezI1Z04gj+Jk)Wfa!DXA$m=Iy{hV^TkYyad>qX7Q!6xJ
zU_jNMj`7=gr#&_4HOwkW`~9#&aj)y07bC&0ZN(l6(+FE%_2Rha;f$Ndo$r5sX0=aV
zIkEdvG0FfmKFE_iw<#B@!4Ae}G58Xori|tFl5`1Gp->j`IlcpYf#JfxbuEzw2{p#W
z#-7`cTiikq4yvG+zi}FY!g)>^Ol=UCl9I<y($gI>PI2F0pr@nD&AUmCs90}gn)+s?
zY4b<#5;p1Vuiho#(YbGYS4+eN<XMxyy=J-lfT40dbjp9{k|m(7RTsi};9Uo7gX&Z(
z+KW-I%DuTAH>zFi=d5?#KOI0TSj?#IxresAQGfeV;94PrcaQdv!Gn68tff7lF|LOm
z!lfe)3-`B0!AJ1^Fi=M<FE?Z;03^?hT4!K2dKlJiSkP4{JUc?gYo+u31-yn~*tz&%
zE=5QaM$14*h_3<bGZAaY#>zg8<^RIhy!{Vf)1_Qe0pH^N3up7-3OoPhszD}Tf^;4Z
z)L1asbm{Z`>PKsVSfFr4zP!`84r#^602nZouusvr*DJB_8-YF)Fk~j)d_6*f^#tn_
zY(CAIkldybvvnNlb__vy>`{<8=^7O?=S^KS{$kkA)DfiS4G2rFiz#J9{l2f&dz^!!
z()DT~x|5z*eLz`2uvC?K+_aq&ui^t=8DJJ9+#WiMb^ot#qV?-_`1b~@%q1tU>(ai*
zDUCyV*=1fAChwOB?Ld;m-KYj@df%+E&tyCPWKQ;wpJXdv2Ql~hHF1Jcf+M$UDkFUd
z_KH*(QbQKDj&GFPdh`5#mLDh>xSlA(aAL?au`rl|9bR)qqTTf!CD!TZ{4M?9-V<};
zCh=I!2K>nAj4~kn#jG#m6^)Zcue2R;_YaIijzznxv(s{>8XcYQ?l#1ex$;H2nB6+y
zRL;++T1i_Psv4!tVd<eg3pV;Ght6;h>P2m@iJQj-`Z`uIA2?ZAD)!iiQ((YtZHP+!
zMQb7AZMEC<aL%y0mrnCb;0<G?t?$<2JWuE>!vwCGI-Yz-*W}z8QAZ_@jE6q~q5<IG
z1KB=kB%1f(8coUIi|I+FxuaW-Hss;XPQuhnh#)F-=#WL4$I2+oW39@6n%g+ceJ^I5
zyt&^P56kZ~`UQIqbicO&Ti*t5l#P>FdZuq>uq8)*orO=(=cJGN?JYYEL8`6Wp}y!A
zpNgoq;<UJ(X1(BW`8`DDIstx?f%yf8(hIEW<>0KRkYjEab=Hzj@)ETa9ufD(m(V)9
zZcM2GeCFhRM~zJ9n@bMXL)!e|veNm`RI`Vi^IL79Efg1ODw1TP&5X;x_qC;EjF`&&
z$8NuR1un!5TM2gpU9UN_LV|k-%3TJJ{e~&dVLiq^1rG0@cqt^gTwYK>L6bH<K$YXg
z+x7K`WJ_u)rhELL?Ieo-kc)A4@Pzi?uBU+~rw<pt8-MXp-NM4gVE#x8ar9I`+~{A7
zE~^Y+M-9pIaUPjYy}s~c`E>03Wu(ZIvGH@-G4DGmn{$zS18o+XFm5;T;Ex|~fEx(5
zUAdWr{?#7vO1^^P#tqt%j2NgNNMI8aZw1}P#$L+;3r9~+k3<M-3;5UEOs?)B`jI0C
zSnEMARE7fC)vIewnnwW#6iJ25D*!q_T$TK_ILPC;pAT^?ErzQ@6b`Ng<6uG0pQ>;E
z$`JT79TOl5kWM=?^;M7SP!~gg^@h2-kI2fdD*e-i4>r)IAuy^_Q&V?zbmW`yL}nHi
z+BDw?J_pX(s`g1>>wk#DZZmlHhx+MN0R%w#+3rB%zEoSX*Cl|sFntetq7Vf)t}pK$
zqL4;Li?0`x?s9OTQ($k#9SjV(OG*|WvPTyrVJMs5*!H!qas<OXzBWff6w`WgiJyD^
zlktxU+N0l7Na~v3LW9c8Ua-?+d)uZK^*9!|%f>$<-zc?I@jCZz-k|h$pFZ;=@-*7}
z4_)zN4@1=ZeYXFMaoT7)u+VUGYcwjyMS*_7CCv&RReKAaAn^4SV4ns<MT{SH`2iOL
z85e=i*$>E=@kjr+oqx}t1P>P%%p0&-pK7q2ZT&(-PyYvdf3}vyB;hCWaKZ5T@s%@J
zYUHQ(Hx`@*Op`KwueO*68VsbuKTAmL{Hn`A-7Lc1qi2zXad3slrQO{J^t=+e6fjt*
zsE}Io0Q%ot4L}4dD-QKeorhudjHEs@tK&6O=~Giv0PX+$`4fzev1bnq2uMa!_W)67
znyEd4+j4MlFq!O52%i(0gS)@bL=}vxsE~PM^-DcTi!iI$L`M;3s8HU0ec{Rcsi1cL
z5B>yHwXbSa)mYR^MJ94nUP^+`jx#$<`-<lxW6U{ziA+hfpfCBr#K=es29i4+Ev?&E
z4i1j?f>*#l1=N7wlq^SlV5pDtR*&L5=U#yRHNt9*6Is#Mq*ivLypV?y?0vV>#q4Wh
zHGJNyv=hH*{afsbxgQ}@TA&>V(EvOg{?UOO2!H?YTZ?i^<HRz&SFSz~viPTOCA39%
zy(l*~K&LatIyE&FbX4T#<pIz6z#V1_cHq8h8U|kuoZGU8J<_2!?xUT1uip0dGJX}x
z`_e82<BxT$F=@>5+&T^cUA+sKE-28@G*E?R#?(L_nZe>R&TqGu?ys8zC(C+K<_d*F
zgjaV8>?A6|1jiY+MfE-bLrb^!4{#Cm?(QyV3z0AqB6#}rX?>AVU#3oq<y|y|k4TMz
z1L)ZPkAq1J!@`IY+-lH-Q8N=_eLh{U{lUPDO>HTIfW&3S`Y0Uos%9_9%j;PE^ShTN
zKQK-G|Lm2+yYUAZJ%Q<41bj@t3tJD=s1j^uU2AxTdGRm35TTJy_Zl3qCxWMU#UG5G
zb^~1NA?FvkY_*Loen|vq(hO**0)GJ5l?qKqR8&;NyVjIu6nphK>gwvu&CQq6!Q2x>
zZ7rCM`r)fwl2#9Dbz^^Zi1vo#azUSNYs4NP62N9!lOaFe2W%MV*cQP7kJ|_WpfDf|
zPL}%*R`AIm0}G;O?vq?2_4%2e&RjTpPe34AWZHinNYzzGbIb+=d#4>M9T{2Y@Upg-
zhk(DHL99S)AamiTT9b5%K-79W==TIgBzR2E*D9$O$DnEym+$^b>0scl4x$VPVP3fP
zxd#_C!fby|y~+lexc+nYm%k)32fGRTeJ+SW7zKmprr3!+1Cu=d%i&_0@O;mLJ)Q90
zAmt8d^|8M1k7*R!MP$a}sC$f3<m7^$3wTKmapFhG)hIE3g8ZV*DDK{_7mx|hN8tZs
z|Fs#9+)KVGkkokxk5HrrY|A^RZ=7=Fk8mB}`hm$rQGg*^eE=i7`;Pv9q`h@m)N2<t
zjG!PbE#QEZsFX^#ib_dJcS?5*IS5KBARr*pBGTOrQo=|`cQ?q;9q&EhIS-yEuJ`>u
z|MXJ$-M#kOYp<POIse3AmkdMnEN;KR=YnEPwEOVQr6s4b3Um}O67s)BA~1s3d&au{
zmck`^#Vor7f>d$fiHXHd2-@V<Kwp{BZo7X&to7kpXmSj6I~aCYopS;x%LNyT_}ce$
zdpFFTPeNvRjOOc6QP#lg^7mXdwIG~)eGXZiYMEr&>-x0~^Md@#2*DxoSh*XYtKRYm
zq1>F)yRwcMKkw0@pRT+4;P3ccuEaLW5%JK7-|VeMo0VK8D4vn`bi1*#v|y2r5K^l{
z;Q#ONoB(@lkyo>sRS22tPSP(k0rxThofj(Z$*n*H>Ztj1yrV=*k73hD%44$5ScM8@
z1^}<WcO{6fr-_8Dx<Y+Td?<vD9G?G}vv<$0KpbJ72_MK(rpQU>UI1{FO0Tz767x%7
z&&WmKOTel}gam0G-*XaNM~76mW0PhdRBIppTHkbr4B=LwIN&!p==7J`?qSU<(wxax
z-l^tX8oq+$5XM23|6#V~=}1*Y=ndndr9*`F#?dCz_c}3TQ9dY^my^wRS|0&=X8NPs
z+%Zf1fV~IhmeV&R(gSl4`?5ODTAIf?E58b6S1C}a0P7wAn6_R}aD7%+qTSrH>1-$?
zk_GC9fB>ayA({y>i}{ZrIx!<&p`P{Yc?cbGGG{udbXWD3@P}2-r%4CW7h>pi%FQ_(
zM~AKG8uFE2i=2>kyeMfhBitSP?U+<~@QuHy7ucu!Fsm<deDV8WPdzLgK@3N}j$ufE
z!AM`Jq6~eU63uHQy|>ssPf5wPu)Wy|AJ5NndUn4zC1p)qB7p5qONi5y3eo`L-#BbA
z=;v6w)G)(&7z~qZ0JV1i;z79>DLXu8R`bnytU?pmNe=dpKyK_JN%5Bd{|=D_GeRRY
zqUy?}Ks9J;BB{w=`1l+ff-3RmDNr`$azM1dzih$9K~bZBZ$w?o68eVXe^+Df6bsQ}
zVq!*kgB+WBwOd*JFyZgYz2^Oy<6je$u0k`BI<Hw!Vle-)c*H++l$&)ts99nrmSy@?
z)(vs7C&|-C#)r2j+fVv7SNNFq@y0826R7^X_92wX$8go)<=cf45Qo+yP#6y(fpf6H
zTMJu@HS5^(Kb#A*vXVfCv(4w(BGlGS8~B_v?_Y<wivm{&f5vp?zg-Y;3Z;RtAdu-@
z#yBX#|HA0?&0@G1C<pl}Sweo_@CK7^%gOz_Znwp%j^HH1Kisy2zy<W`@b!gD@<J$G
zkG~_gu4jE2GdL=3F>_>W+$GMv5UYTzP%`4H&mv94b-%y$51>wFF%c09UwFV#yzqkt
ziUSk%2jtuJv!sNo&(e9(s>8~FHqsztMFS|ohRWC(d7Wnj-(^W}119qSTd}G28|JJp
zU$t`j&uj<gwB0n$^&AiM%IZkv+0Z5~-(yh(MupjU=%M<H`^AUfihWLk#ujDT@HB{N
zp-_!TpcJFab|Rd4O4?)ewi1jrC;AiO1J<9a2X#u)Xpu!L-r;Up$yVQN>tA<;6-0AY
z%-=zOG!b3O4rV_ioRcN{XXdiuKind;r6Y|M&tsLbPU8(G3=E9wBimL`5I_;QZPr(*
zqzwI>XVc*)C-9f41Nz&qEoai}{|7P4?WRx$NkI7k6u|Wv)An2mC+f(|Js=UJzSk8%
ztye~hGJW~)Jsa9SSV=T0rCB_l!K&!)YSYhad+kgA(L|K`%xeDflywceVI!OI1IS5)
z{C`qSs1p|By^X)`n=JnFkzgi;$7DEoGIA1RZ95&-*uK1vWe}@o@9Ax3H`(e0#W3Z{
z<?Et9wbS!eb`05m$1uYCc!u>pM=>$Nd?Sdf1LyVKB^GdLDR;Lo)WF9;mM*D(84iaR
zsagC-6wcsRD*j*63ByiTfF27D^v@}ZI$Q{3ln35%0_GQNDagoRDMTOh63PyMxMy&{
zfLd(4+9MHeUsRE1mtgeDZL7vmqEk<5tt-t@b7sUNj$&akK9<)?qe4Qf_?FMX)5-_q
z&tlts;M!Kg?ABS->-@zw|4EW@&WYTf{NJ1i$C|HZU@|=KUB0exl)Zj#O7}AyyycFE
zu9Dg>s#>9;9a~o^D)p5uUuK*!0PQs*FMqlmR}KyRI{L;S8JCs8uX{+VAHVeWDZYOR
zrx!@SeZLL<Ffr--2XcN&4C(!+6~=N(>W9HHJ0n}$2Nlel{Cu&iTZ`R#EE~FUexP#g
zOhMV~V2r5TX>C7N$-YwSM0Z$7EC%x8y&*O?HHA!19IakhnUy4rSozdwe*Ic7xv`(M
zeNyr~YCESSP}|BxN4nPH(Ik1uw2QjHoBz$tbbwq&(NL8=OZXqE4ldDGhFVl`pL1E8
zK}{Ga33}Std?X7=*L*foHZqo0{ZqwbV@4HL47sBUvouGNzLIL{>Z((eO1=lN!0)19
z@cD_7l2ZQmi$ajPQv{hlM@L6coS^>ZspZ!?9i6CX3lQN&4t!!WI<E-V2%Mg;R#o1D
zZbXLb;5(q)0EIbcyJLnzEHe00%|3k4GtSdL5$1{gx_4K4J9epm(*k<fJib$C-=UHe
z2D_6zwcO|4R+sm}%Eh*C&tY#$yF-(2>z7l>l$YCaJ2~}sLt5|;`xQgTDhTNUjJgDa
zov+ALRC6mSi)zDpKRs*y%@r~KxWE-9llo|+0IG<C-~sjNfFelP*{AjB$n@7HzXiFG
z2jO(!5iDM&-ko`KhvtjPBU&R#v7B3Dgio2e+i^53r@1*yk<2)FN6u5=w(sLk-lz#C
zR~<_{s$<>A1Iay$UwR0IaRbw?MJ4dSD`g#M&rFBVX-7n?MwV)pYyPi{$mKx$8hs85
zz&!7T$6^8|0~|7`Tn+43>%eB{3}o8k(sySxyQ)`+;`mP9t*&;-Pg?##V!JZC%xRN8
z{&4rzoGAAdwpa=8x4i{!UbmLCG;4*lgiNcqY?h1ImC;?$zbl7X#tAqyJ|9nmIXdjQ
z!9?cMf^t7x&B(~Ec0c`QIL$!umSeVEM^gA{y@!{63i=cDPx!^1+B4ks9z*(*U*2ek
z9c5>v@GT^Zc~7{eU@2t`mNFJHTAWrobrTi1rN~U3tm7W{&hQOkc-#xv)+-#I4v{n#
zIBLJH+Y){yk#7|XsjM_OQ~Ri@S}7{HJs0j``N7cKGw>7GIQqhlm%o*lFF!H`f&xGt
zmMdjMtR>O!0~-M@Kn&!DEB$-nI53OJ6wJSjNCiPR8Yn|7$X;+ivCRUC93<}qFG9Ia
zx5j@43quTV%3<8sW2H=2V1`@m_qu^e(5|ujnv~tP<bOn>Rmeg~Lf^I5YUB>7QqoRW
zkv4p`H>A#zg^PQ-m^y3xl?Ka0m3-6{Zd*4bCG$fspm|474%c`1{jlC(=WW=7Wygh$
zds&khJ8@=|WLx~q{4yXr)m*wIex?R%-{pS?lm%Rbch!;f5;$?fpM~@L`jD0_wKV_F
zmM!%|y|~jyKlsN-&sD$EQ##;5!UGjcXB#CShaTK=!xXXw6SK=CmG*ZvtDZ*MJ7Y)@
ztI$xI1gAMqYQLEiCS~W;+j1LrpDUWy3wTp#yT2(pgpq>r1ieeJn7Q8VW8ml5sDtm%
z;bvWS$5v5<w&NTwe1dPiJH*NurL4HM_KyhDM1o92ID|t?6|COOpKL1hM=?c|o2~$M
zPxny1WsC@EzJd<)<|~hPfbCNh!9POW&g~cfqbj4D9aLo~h^&hq5)u&&s^XkH)08fy
z>|6a^8Ip7riHG$&e1ZN)r6ppX`q52%l%&6o3YdX|zHeF}$(zvp1Ikfp|KVsB@%X8r
zw~l!KY|`AEUf2L18#qjd{4Rq}zmsIgnb#<A7N#?8Y(G7MV@VTN3dqSAI8274<O%ke
zj08sJkkzms6?SsGy>{yR{u#fJ#WJ6rH^Xs=t&B0o9&Xx~Uq_952eIrcr3ELp+M-tP
zPh4nqF>X!SaGn<nlq${oxu*fKc@QOF&Bh`i$jZe$W_TCV^}uHlP-puyp%n;il5Spx
z7tM8iI?-m!ehdb*1MbtaKz<&deo11_h6$GqV#ws5Dw6jBl8#VP!+da~`*9l@x2g+9
zIxKU?Q+wRjA^fI$@KOa2MMl=uYwULvD%M0I7OV{Ncsb;oHoRs`*}G}oQx3!{8Rhko
z%yoJl@82jDkILAG;9vf?K)-crugnUz(N_V|vU#Dv+7UeXEYKPhS;F6E%H~RqNMgEI
z0Orz*%b{K`YbP0#<OZL+nnyV|Ga7#ePyr2rLu$ZGMPO?5OyjEo-4EZ&RLnwx<G_{q
zPVHu4?-AP5!55DsMjK=@^$+7ZIkh8Trpz*?0zlpY*b5Y7PX0m)2Y6`s|Bf0AsOLLA
zRnP8Iht=5bVxt&>nfq&+D3qpWmmPpU=H!;rkpdg)ZsOD|329}YUzV@DDt!()Hpe&P
z2JX9`sGA@B@I5i`+)WfTat^s-*4F?F`sGUN4v8Ypq+k%e6g^=&;eVUpqD7wE?Fi?{
zJWoCpLV$&l*f)q7&Hq{KbDZtygEbu30}4{7$8D&ZZ=Y0~vY4&%*)_uZ;qv*ilC)Qs
z5<9O*Dec}YA=~)W_@-wz>19`PT%RTUov<6tW3?K`>M$45jIor-<Rq=IO4S5hO=UxU
z%#R;jr&Q=fK=>;gbcL&~x0XomAGImOQj;h3IV&R_LbUHOP(TIwZP0XFSv11)N!wdP
zUP**r6d&V6aY9^9MJ1a1<&1LXUF`XqU0D-aV?$yBK_xO7)zxH%f&AFsJdG!}#Int&
z<8tHaX5)AqE0{JiTC{zcxAQMA7Z_kH7mR?#xZrq{WPj~D+N-L+eqpw!{eJ48TW)ei
zTG&gaxa&*Ya?50HR=3%20xUQw-&-=HzM(krZ34X6^a#EPrR4eH)whVa3dcFD3OwPG
zyb1Nz+edhdm?fuX=}uxqFyS@)GOJI?B8%pwM^Y`yRav82)mkuiR4-C9V+DfV=SO4@
zOzke0eHPbs#bvxEe^l_Sk})vHjmtVK-U4^7^)ETsTV$!Kw#E@O$gJPAcXzyIW`AD!
z{p%2ld{<<i$I!rBT72KeHs8HF@jpw)!2Lya71jBh$m<e;)9Ha>BlsvAiU!yde^VyE
zi9Ep3Gt%kX0T>W#ITyn_Iin9C*5F`=#mL9|^2qucP!mc|)^V;X)7*U{+{<?7&K*`(
zQiZE{c(;JHkgTTW-|DgF6`vyfS`1vK1G!Dj%m8A4^5hAix_`O%LCy=DZ76|auqxk9
z*aOYs@&<|y$#Y2(pxVuNeeV6_c#cwHLqn00m6R0WkocOoh>y>uOt*=Bn-L1i)avE-
zxV+XMfW|lNesmkCbnLss##S<Tz;gfrB@6w_oD^-?Y?Tj%sCcXsfc0T!ra5S|gJ&f<
za{^Wz6ryn_eHq#Vh!)(v3!(-ymud<B^B3^iQ89SzB2GtzPi#u7j<I-QQ~nsVv@gYw
zMu6ZE)CT&#M|Iiv4<OF~b^P?H_aD`}1O)x3kw%M9v6hiCyCO9f{*Vt>+aE+dH!b?q
zLxTbGQJ$WjN=dd{fHe;b>mC>olc9%P#l_{kj#C^ShISsVu8O+O7dbpdcv#O6GdVjr
zxS1Y}B5w@kOBZ;rzZUUKg?<j*C@9&CIUd=~z3K09d(;Xl8n|t2<mfm-T~WNx8Xy9|
z>VeqcUUb`*KB(kq3Zzsd1w9$nt6Us_Lme<(<DdX;<%0tQsojg;wOvRB8+Go1ZW%m`
zjH1?$K!1ek!E-y5QYNo=0zeU>aWoZ%l!JgT+rP*uPC$Vt!q3;)B8VY}&qxhU%w)@R
z<>v4ztH5$JWbDU&jHk|CWTEb8x-^|m!~GLjMhnwmq{y6Fd_fAg|99B8&1UY<dsg=g
zT#tYTTA}Zvp&*O4=K_g6->An?v&Z}3nv?CE<)tN{fPQrD-WEhjH=rg7xn~033>E+y
zEabOGUHo>yxH?@)Wn|1|z(7+T%7)3_JbrQudi9vah#&vNxyKzz1_O^XP%yo@QEX&r
zk(TZ<?07X88+0l|)(WFms{poT#!6kCa+Cl5{rh0XzkEqd^ki*6k4A_7J?fI@333H4
z+bznOPjdQ^?dzl^^KgJ*44j8JWye*v5s#1VPr3|I{$?(K3D{pgMm}282cMRcn-OcX
zc!~4&n!paoe_jolKL49-!F==9_P2SHs)Gt-2@3WsFwojp2PMQa*#GWL*rNTCVGf$A
zECCWkQc}1$I3q=7%0O$VjT6Xy9vKqxfIM=1<@PENLu}E7vZCQ*T-cUC12GMuSS^{j
zl=H9+lmKnmBvd8u{>Y2-Q9H;wle0N^?LPUxBAEZ108F7iBiSW->B=|n6TU8u-SyWn
zJN>Gj?t@R&TTwO@3B$)4FXlb<hK1V@L5FVhxXGtCV`+pvZ`dnYBCG_3mt4<iT;ZAx
zaOKU;j^|(@4corXy#=<ObeuDvVZO6<a;kffDlO{&XfXv6Z#2E%!+~v}Dki58b4`a6
zQawBY74QWob`vUdIvYU!QphEKBo_4{`h`XF20~V$$<13V&L7M1h5pS8>G`L7^&9Sv
z(Q&MI{jVQw%Zj8goEW7I5@ggX{M1qQZS9YD%*-i^b9{xbC@Bw90~MS81*<n9lt3M{
zp*5yrNp>Ek{I0s8#M#YOkGw(5;k?{egAC+8>j3q9r$DoLGD(}=^J(L~edy)h%hXr6
zUwj@Xc(U9{F9d}7cI|{L=VZ0q2L;N0lBy1&_IS?Y8Z8YHDhg(Rp{(zIjJS(@tufFr
z|7)2Z?hBesHBk82Ei00YDICLiUZr;&clc@fQ2oSXryZSj>F)Tc@WZFFkEocH8!<Yu
z>>5qq5w;fXpf2s`M0=^tWs`W}yPm42y>;GQ<&^6zcP)B&7xBr^tIQ;u)*8`w*9m^p
zP9xt<$E|_I@H#p%@#PbYYL}gUE1i$zbCbJk&NVxY%Rj8z5i#tq8DaBP<C}s;l?UjX
znsUm!)#5Y)?zd8$b}it2k4$|yhYvkX?CGNDCYy)T)F@4(rN&FRIJwC24lm<=X;aM#
z#0_i?80JeCpb5eWy&1uuX?QAb0UssZOwM?Uu%D*{BB4u`JN%C;rc^7PES@%ANo4V1
z&>74xE8Zg`KY+B_{lG%X>>fU>1??!KA|pl59K8VnXf;;J%f%HjUSvL)k4bb3nU~XK
zOzBP>&wC#8{3jm}gWJs&x@QZz?&L;>`cXxma}<R>WhX<0?{*u-Be~f=QQ7rd%LHSi
z?0Om^d8`ork7BiSJfn!M>Qn!?`lhSG?Gp7#4%yUqrYZ{eo(6Q9Sfpm1+GF=KxqIaK
zeYR~Vv5poYbJhuR(hfJ2fSmdlxLZ=Ps?ZOvW8zjU8k>t{|Cq(wZbS<z$Jt*5?UhI-
zU^ODw178;GOE~T6P%gW=rODiX5c(Fhv;$U#ZZ(SFns_>!yFA?#7#0)UPJ-jQyFVWf
zEb5XflybCUV`7*KQ;g=C%`q)#nLU<z(jumS%(;5Ki4vs#k2H3G_AZ=Bd4uC&_9>`%
z0tWIyTp=-WK~-uGU+AbJGlMfhq_?_Sl&g1Ee-Y)`W)8a?H6Nx>wrg0Vm$$c$RaG7e
zbLiAgiwGQZe#cHZgk<Z0^;D0x>U^4`1uJZpU^a8Q8r&HN|2kRAWkX6CcVE<8%Sjf$
zh>q*YiU;@M#^NJAzu_A~^i<m-)}r-h8`Fh6nS)I6l{6vzUf;?6g+ugf5veFL`w-WV
zeR)vHrq<5{QR-aY?t?0*TJ`A|A5OyyPD^o*x_|>`(91i*yT@Rm+P3=_41Z|WbFqCQ
zSQsDfo<^;mHP%&4M@0@N<(B+x1&Q*BR`*~<iLhYdsR&w3m)vnn`KDFe8s$&uLd}>-
z^3d!k83m4Vj&ffKN3qlH#;V|FtD5*7s5#Djfpi~r4vF?6HT%R4xk#Eh^O0!mSkHVm
zN+2aQn1E%a$L1mpCNA`%vg_Axh=0?!G`*dC-(!$wWT^A~_Z~kcYrzI0G@I#qll$7H
zw+CT%&8i|6)9Y-OL2K<8PEK<0qKra<A9u!q6d%M6i<N8wuJ$;vMNnTY3jesA(>9uW
zJOMRYG)(Y7#7f3DmZ?45KF8K&(`o+pw>Ag1TYcKcAnQvXwb0yRyF&8Ts1^6g8hsiG
zCF$xH2s*nx-^ar%*c48;F~;j)G^ZP3u-(fJef<e#laT5Rh{g=Gd}N)z<t=ks2e-4h
zqwpvPXUWE%b&bB^4>$Zs!_PPTSqO)!2%pYc4{e-mZA6MuV>-rWjSX_YoaIKH!9S9o
zcb^<g(qEHA=6`S6^`AJz-<V)iZht2k^mb<}0mYi;EFu(L{jN(J>o#k)W7jj}s_~QO
z3YIG7)dStbvQMugU40JF$EXPjPl+6p*EbwC=Oc48jju{V46UH^Cr*`<Tj9Ch(7oWL
ze%&kets8Sgx5(U8u1wn3bnp6)%Rh)^ay@Z&vWqVzyBtrI)lYf*x-T5UhZwO<A{zH)
zP+}C2+7RXzA5FhC<}^#jKFWXd0|~{B$9L9^3TBq)<0%ocTkp4nf7HA2B}J<jPCyl2
zy*LWGTUQ`zR62+#B?>?8nuMZEgEjbSSlOX1UA^1BOWL-E92tqNN`x1yC%6)*;>8L*
zTA{h)YYa&&<5_pAH4YUW>KB<FyFLs&g&aR7L|^Zu!a~)<=Zl1Dr*`*qC~B5GcdTDN
znm+E{W?ZuD;S%>UOMM*4|1C54O?Om=UcWkpI-fc3-p}E|SA%P&@bp0*aj~Cdk7wf&
zt3~F5r`*Z0Pi?k`)hI9zD+Mw3iZ!e-PX>Q+oCJ`F4jK`A#5BSCJ6=-hSG^Fm+>0IZ
z>ORJUvPv1jDdDGcv(yv(J97S+lMcrE#CK?HuHwsNGyCCnEb-C?*yG@r5KzW1i=Gat
zX%CzI=(xS+Tk?`Y4)654`Sq;8?PC5<9+oQy>%|Y6fg=6V8h@;(QE+=L+nI~}l$&Ck
z9i`CJylEggpY?HNbFMOWDq%32q$djqyCA_b^h2G*3D_qs1$6XQ4y6{Yr|l?IO8P3Q
z?XVr3^>76u1WZpxOi;v~!C@eLjYPkp$}hU8;Xq8R&EAo%aYMyfmX1v1u5VEbKVf89
z&Z0HFm$9h2)cP#7W6ZR^S*6?Yx27;D#ePEu=w;=&VAits;O41s-f#kVcac-mjk*-y
zFL(Uq7w=t>F|+F~>Jt1wlu|I$`q)(=1Ug~i<}~$+Zdt5L0J^CR7z5N?;f&4dwUwXu
z!LzfM%#Q=wYZU0I%Sr#FIk5#J3)4#~|4D^n+b|86*8!C=w$0N-X$8kG=%rjPZ{b|0
zey->BVDTz#WL+Gr($yJUDSLqnlza~FPg7xEgiBe{4~Oh+3OqTxY_oWE+{8M+3=cML
zE(edNT1*9tX14EA$LR+89p;<S7cyoP;NOPfC4GIIg7Z00N3Y$0%B<Wi7B|V`$E%3t
zHu`r5Cxta7IowQPd7rZkh?9u-CQ$k~amN@F82c3~({z8W%Wiz&_X5FTXSS1z@V&N*
zQw#2nCezd^o5dT+Kl4WP8T>w6yE1yjUbd0U9_W+fPN)C<Tdw0LY3k%B-}R34z8=2u
z`MIwFo@f?MVGX9ydPQlYmhd#~ngMbx_ty;=`h%$WY)dCl25@bo?!~qr+8zzo8*^9=
z6`Fi|gPE6?hsRB;ettPTd~E3tTmL$GWN53g)f#0JM@5X%esKo<i$%o=?{-JfDqSx&
zDBLSIU?UMc_7kzX?c#k1!NMbaLLg}1Mn7D)B-2NJI`10rc9Ed7z+(kvRonaq=A(nA
zRwXv3L1!E$cyxO9?y<>fU5BXbwEY|zIg`uBq&96dBi#9);)NAbgl<pBUQ<^c4|$_I
z=HSc*O;FTyOF7vZrmVn1fhy>3Z7fVQM>x;k1S9XIsw(_jJqEN_t|a1smzNqbvOk!S
z)BZLPVpQ*YFfT96%?;h0YYW9XKHN3x&nlFam*40V!8bH8U{=j<$h)N8taC3@y(AmB
z+5)E{0RaJC-oXRj<@PJ*7F&|79qy+iKEdtEqXHg|KRP}6BZb~3b$gnw?d1~>fER9L
zeea2$%)JOkMY)84*+gY}q3^-(TL}<_gqBbHtZPK-9<3OPeM>k%nC4LBWC?36j}Ffl
zr+i}$&LLE%qSDici_w#1-8fOqo4S2CIGLxhCHRJ=I&4&u%J&^%MPrI~@TBCkZqcyR
z8-ywm_5AS;8$*jsw(v5#XFf1r1k7|#&j{KM9Hfww(u%bfCa&z`vr|G<Yn3xj&TSbn
zyG<u5^e&*h{-A19b!mQ{I)-nXn{RZXtBa+$2hoXe+n<fR6?R8RC~gT<rmL9LC4-6W
zm-)ouPXZh)20AJ$D+2-oKr3YFs0WRU?hYHr=OjWWd56V7HDb-?uCMowht*H}FVmNn
zYX3fK1w`VZ%swKIm?AvyO|3K<Xwjc|S<X26c<HI|XOiu7L388B*Qy_dS(e!D@o~b@
z+|0=_eQl5Z+{3P8WhAEFXi+CteDcXBUGAm~AOusinNn<hT`QOi?qHYLu5$Eq6>$`Q
z-?`}skflTkvu{W#r~o3-F}(6oDBp_(WkBTg?WwJYQA7&Aq<a^uzrRgc*`lVBqf?sI
zJ%uIvj{I*%@fJr@Z#QOUQp{JEMLVc~%)#5+I|85nS<?EjRgIzdqu1!(zT-z>fWR0C
z2QJ4Tj|QWwpe=*z+`Ej9p3j$Q4H^glOIU~N;q}ipW+S!Z1e`<F^wIo+Ecw^=deUCt
z9g*Gu(k7%YB$HpTH2xN%%K7Kk#|j@;7g-d0vNIBj4l2MOk}2A3UwY`@Jonz2K(zBc
z!}{uMTM@Ra0PPEr-O$^4LiF;|z;)pPp$$T5gF|vBNEL?PgWb6JzBy670mIy1^qU>a
zTaZx03<vMAadlVEWSs$(s3oj<SU9(%$J=WGg}E2RWvJy_{0gf-Hh3y~7=IDoxM97}
z=}JT6so4iLPqX6qL_|m$9u8q*g+a%wPY;(etgx}lrJq?=jJLG4Nl8g@Mm^tp0E*Rm
z(~DJauJdOOfu1IB-@d&A3D(Lvon`Y-m6IbNAgF_NY1&J=xgELhjz-4Cv6{$k`M*m{
zOblHH*4{vbENDC9=jvzzoFIU@UI)_hbCJoYiJ|%F*^FTSYex9q1|_;H3Ygg*xA1kJ
zNUj7x*{tF)eD&;{+*$z!G!0;T0n^KWN-tX4P4j{-$V0cv>Cj~t(jBj6T<KP|YF)H1
zNoS^%Uk^VJ^J_1burU_MO!F-8l|=Px5O@4O{k|lhIZRluK6S`7Z*q%7f~}2un?A)b
z<ZkFZDNETt1Q$dYsODSzZ<1$ioYLao=ceYaZB9)=EyVrkc*=WZq%^}~tLW(T*FzcX
zRIu0A@%ert-iQ2Lw7+uY3J^5|cd*U*4#pI^9sXxYHa)3wyW<|h7K;zA<E%tXOiYOQ
z;^kEUjkfV?MJFd7_b3qG%s^E*!&>OceYA3@xh-Q1=E?RUtHBz;$OLJJs7Wi{{Q4dI
z;o@JAa?t<D@LT41h1;ahY=2fnmhco=`vwb5t&w8|Og>IIG0>bHwz#xbN$@}@B5)6H
zhjJ_9s{Fm;cW)L}Zr;PQs3g>$B=yI0NeR?6S7>wKzAf5OCQMbMr7ANi7D1;=UA#_>
z`yp0zVdl2I&0T8cQ?6gAu;ytb;IBBsqbU{$V%at^Qj%!x2Dh$xWus>@%+m7bk0_Io
zMG$X&`TF&xk<nYCanJ*$ARb81M-b0H-qQsgKvuBErJu6C94@v1uIH7it58~6+C8p}
zcE>k)TBQKJ(z!efqyS=K&_D5KWI7ekB=i*vfYwrQr<$XY-7P}$NYBsbw7qTfYa;(p
zxL8pBR1dLs_Hp{Bb_fDs=+ht=4JPPOoS<^u@%Whqk>)z(&k`AfS3KN-v*@e=MvLT2
z{01kt`-siHtRgrAPmL3vfp`B9Z)X=9LFez7Cq#%hNV_bfX^uin{bY38!x&||k!rd$
zO3yGZ$(K?2>#*}X!o#b_pAQOL_t806q4b9}3zE6{_kR^@XW1`Wm9gLb>hyw!wx*E|
z4TTT{EyM=SFNJzY-SrVzZZqE*GcwF%;zms1?abfCl7(h%@vyKas=$<Wh66v0I7Q!n
zni?WvVuJ@!105Yghwl(EpZS1b^)))K1n5Eohfhamftz@0db+H1@du*Qk}X!6+x@`4
zU@#_jw3w0@)-3Lxw&Y0!!x(6vlB%$i+){XX>Z~n{xNo3=E^+xRv-ln2E&T;A4A=GA
za<)V}xR+3C2&6zPb~wvOCGEUoloi;&qUJe2<0jw%<NFnrf6VF{c>s?zN(OCYW<E0=
z;dzgVEv;p#0}<`w;v#UsBLN!QfePS=JtN=rtcI@83fFbBjoZw0s8|7r>Y<=e5SXxl
zO|Wx>3i)lvSn;sQFaedgoVgdMD{Ms*i55qwgSlg>X?f#DUaY<D&7vO<VVQxl9-DYa
zA2yya2fsd6y<6*ftZLvGs+Xiv2ezB_8r@2YW*xuRvE=d5(d*^@`G6c^QrN0SO#$fY
zytQRt-5v0Nx*MIz<^&?n*#f8DH47Wph4^<&etstaXh@&~FZPHZ1+g3QZgzbHKFI#U
z2c_vX%I$=O_BdT@U4L%K#Fzvp--EOvuY{Eie!T(_<rU4P7t%K>9w0`RQY$3gQxF}W
zpSNVL)nQOyzI?fIe=Ztm%|8S+Kd7;_wqBcSi`LiI-`9jkuteX}1%8!e0}i%)kOPPf
zjM|G+2s92Q({>9Pt+pB$8j5e+JS@F*MmwS1co=FEb<FM25=JB_cpJt1X(C)c>rxFJ
z^aD?bF*~u0{Pd%Rb-08))jqh}ffJU(t3X(D*UGgW*6py+u8oZVb73yYo)DrVX+1p+
zOUv>$_v3}Z*Ldam0&S)jC<-y@vMF*qu)`Kd=i@!9Gc(_<ssR$*);3=!gojsH7)eeX
zu?B%|0(p8@Odyke&gyQ#A1RTQu50|Y_vwM7;wnY4T^eNA<kH$r35^I-pz$nw?f=DQ
z0?GjZE!IL8!v3Y>RpxEXmbg2VOhD5f2Oac$Sy4G^IF1inYz1u{>@jSb3~y5<5a%)@
z6=}8@MB?DOF$T5&$OnC|Z=kiJ-V*U+<(kTp*|jrH0dL_JQ!&XbG34`ub}5Vi6gW}?
zO9#Cd!U1dmUV>ZV(x}EIy|kQ1=a@F(&=0>C3;S1%*<!&;N(#o6hbg~l6ueuZO`hZH
z#IXC_X`R+<Nvvi&*X@00xBGmCm|V-HxrfOCs6v6IoQD~E$;b{R6U4TXTW`64eYHK0
z_95){3CN$>{O8ZSI(Fje{*~3hJA$cKdDl=qXT-=h06MM%Uv6t^J&@t%7eEICM5*a^
z)^^%ixPCq^YtkvZZnR@pD=iMex$H?K_=ddto6({GT5h^63eGX8#to*CQm4=GJ;*_T
zTDC$-kCJ~5w>Dq5Q@(;VXlF$-L0pq00U{&v-=F*h31Pn@#wXdv)j9Mxs;Gp#NKqs&
z%y;S!;7{+fALQOsp`EMDCcKm1=$5v)MZo%~@)+%yD74DWB_QeL-fGI%g4zD+qc5TZ
z#x)pUF@Mb^C(p@u5a?GOp{hkktGx^Fm)I@riFVe`<Q#XMa`QsZ$9jBdnfA4nsK17l
zIM_EMYI*h$gfEq7Ruyvln!$BYaUTl?Lf`0wWzm3geLLpgi`O0}cvz1wytU<PEmZKb
z4RpiFND-xZalW3lWgp@XVQNi>a|dQy=aHVE^&4swrP_R?oclZEIelQ#t(tB+b9=;k
z&DzG|ow6~f%$Wo3!?Yh3()DEgRx!eJuDcCdN%~hpbl0mGtNXw2$euQ#hdzm!lQde|
z9oG;pO#3W4_kM6O6HobR7H-jFHerf^z_*OdVT<q8mexepFBKqC`cv1rg+th%07)HI
zE!H)ZBCkQ*n4oi765io1bAe#lV^le#i_eaR!Zg&$T-wa;#q_wanSkmN5yjkYZ5T0d
z&7XcQUi4DmaJq%91;fAI`G`7dndrgCR&Lff<7)e+46A!USK3=;C4Uva!(ZeAW&7w$
zz#GTLZu5icqO`mg>j6gNo8B{pTwEMfOqR>38*g|Xh~YRim|_VDmX_<W<v>^yv~GZY
zbysw*Tz#chw}PJi@UxQ^Y-*B0MtTd7CZ(7WNG-PLst&Emjo!W3v0dSxb_3Uyj0F^%
zxW(5e$kyj>!9r;50jszyJo;^0VqBZ~bLx$CmXW#6r&J&5#*_`C<eNRF%;SfBut6QK
z+vEBp#x??T$?TEcG=>NQagIH}CjN}I0&3WQeS^MQQ^f1>*yIPq`0w%moGxG|VUcQK
zQT>Rl%4Pp26;od3B#nSYcyu%cp1D~zU&rtyxn0AYH^#Oszn(}Bsj7(k{?^U=4HkIR
zguw?idtdKqg;A$8(zs6(QS`YUu?=bU^~>+X7bkrAxEMj*dI?;gC`zwTQb~STPRJ)X
zy~3%xu#tdTdQ`$s(T_(em}fxrTH}NmA(h>W7pH>}bLpG05EW@_GZB$|oWe;LKSuPa
zCf{zoYWt#vSiIqR-ABM%hv^|o57tX=Yfin!up4Amwl8B+L@yWOU6|nT{a9A0fcOTj
z<kF^;R_L2%ZmG5`0ih9u3OW0Dy`hOIeX*GFuEZB}9{U|$1_jG9N5vRZ(R|g+=<$!?
z<5%Act6V}&P`X#)NvTq40=4vyYn2tKoxX4z$pwxt9yVCriE0lQq}*#1vSGU_B&e3d
zEqf0ylHY!;%#audaB%G(>$~YU6R5PLD%V@tUY0{2-e%%hIeo7=0~HTNT>bokPZKs1
zttGyd*bohmp<KUQlk_AP_430k9Y?fXlt#1}v<qYE-Gi)Zd}@#2jyP?D3)ezdk_c7n
zmphwo%RipFMir-_dJPiSYxjhBpP0&$j!URN_@t<)y14&iyxau7Ra-0T-II{Pduo)+
zSq&4NSG{*`CQybmRvv62*2Aur(+)nGLqXole@!i=tiZ!te~y;`qu6h<S>I;zo878P
zTy`V-oF6_NBYl($-}Acep%MEmd;DL?OUz~Yom7f55GP9NIGzbHFCesR@m*@?Mn{rM
zzlR1z1w%vmb7+a0>sv6LPten0%ecT^S=Z?<UI|dP(JXQ84h>fMzLD*$q!9(K_TjUp
z_ujg{IAVn){NFe3H9p_P#7#2()Ys$_@rioDk7<2=X^q$pwDelR$*F#nxy)K`J&@`W
zbX_#u<X+oXM2?4Mk5m~QlC#M~xg^W<akRGW;W|q2>wi56z)dqHdTpjPR2EW%L@0lK
z^e$Q>ngI5NkKTjeb+mV{HA=5LE1Cjc<BE#vB~-96kMOJ^V7I)!FngZklWPV|RMh^i
z=j<VhF?iC?jz!9a`xUK5#OV-YC#qcb8st^73-SbkE@~8)L3>+@yeU;-#QK+k+cLpd
zjZvA;#0?{LPr7*4_&*_kH$Ytpk0wAT(2o&$vw*Wr!)Zhtxww0N<1JwJ)(N!RwSs;m
z9qq{-O(UEm-E^pTG5-Cz2j}eUE2(~L_X7r48@B4|g4u|sfqP*J_Q5Dr*C^#^|N6&V
z4FQauHM`&^2a)r2cE3hVzw)o29OQO{2ilI|OkEflk=-%pC==y?%%uQ~((C_yT+zQX
z9^&h7ymwBw@YyaO>;jhk71o7o<s7x4fIm9ev-8y+GrI^`=W}ieyAdts;{9+o8)(RX
zO3Ax78f8L`c!B(v@h6j4zyzn_{_{gHN-rI-y}Q19+K-Nc^6JGkrZ_-4{L<iS29uYL
z`mZ6LW2H-$0D_^Q^neb9i6p}Tsg<@bHEcBs!6nu6=aR@wlCj5P;bI6c<>!}pBp?KB
zgdU-RcYgp5(9-z>ztWC^TvlW=Qk3vKr|#7eLc#EJ!~u12dGe6qk8QU84~PU+9Xt);
zHE?VI(B>N{cMo~raeE!2J@aq|<#Em<Ze&j(l<oU0`f%`<^84Mwm9L1pJzxB>C#MNq
z0x1yutkSt{Kevt>X+!y0l+&%;tk!n$7w#|M?c^7gEw(mC;RPqM*Vt{P6!g2>zKR>p
zCB`5GbfxBybG1<}Vq%`oJPjXMSEDS|Yj{`ym;u(uX!hah7?^SI+|Cn#%47vDzE5*d
z@W8n>3oVVZ4P-=8yAJAA_t+nZO}EIql(1a8MKcIbN#QYVVtdKA82W#UpNdzaKFc(h
zS7=p_b3bU}Un`~zAK$+27HKeW4j-X={x0LEPdaq<-%yJl<lcA1c$1H6w$rq)ALOna
z0h`CxU-o&CrG5R>nLyI!=H$jIcCW{AA|RxPjc|&Rg}_-wNe$gp<P1JKfq-p&soCTb
z<G*sJ6v&H3bYNj@+GBFI`-N@Qx=PEo)y|8k;TRt_$1xs{AG@pmRnMBSXzQn3W!!Q?
z_>_z-%x{V}m{i+;g`Fzhg%%r0X!vG)?(<PgjHJe0X=)_mK)L@Vg6~MyJ${b`H|eB+
zrA=`@c8}@WVJvv`Tby>?^9E^<D{*%`t8`=c(<8h3!%-pwNq1mli6YCyBaO{Nc44Qd
zuk#CVr4*H=ts~+&<3M<UyXiV19#B1b_|fTK5K2V<ze6aQ_?A0k?ytSo|2K}ZXg9Sm
zHx^JD>64qqv@y8m>wfuE-0qEf-6KKYv0uILoO&HvjBE3QlNAdLI(M>(VFR$$rS+k&
z>C2fT>^&tNv6n*kJ{^QPG@b6B0N$1*kOkytU4s`WH`vgQMYpFxxAV;>uUL8Lb>HVr
z=pFl)Qx-`HqPpw^L#}~To6_WYc-*Die*#5+lN_{Uu7fn}OVHba2%U`h{8Eq(0?lFA
z+rin!LjTv<4l+H$8*lD$N`Ge+I`xyUKbUx3Q|*ZM=|Uk_PoGD*cLibG$}>0Y5?=cd
zj5)tur0>dlJP)9yC%vAALM$!HpJ)H5f6JW5aKFHz!Pzn93T>g_x2PZb{Y;efWL6$7
z#;3mMvQb=$;&->ME|BNhGKZT5u$|^kYmCS5)SLtZB>e`;hR~IKwo5TFtV;$4xX0KL
zv|w(2_VJe1F{IX{7mv_1^iPO4ck2w|J$4P4@<AJJIE(|yp*+FKM}j>;1K09SM_1q}
z82&Vtc6;xgdhqMAaC2IMyQgwr=D2O3BgBlM;%&k)q%Pa5AX|psf#!W9iGFHPxr8J(
zC*5W$hXmGq+~aOhUOL#x-UF>=L+_5e<Cr|RnGO^FB?I#v2?SqWt$*lCkF&&C8>v{j
zwZ0G5B=H5hoKw(w;#M^_qKPzMN|%hyTwqYaL372w6g~fb8R!@YZVd_)aF$XoL~Wv+
zFwsyJ6|Vt=U-y>ZYxm1?I6?H2_qjPgT=uP5xn^l$EM{QZKvG&X>F>!rZLjttGMluD
zuI2c-)q3ttCRJDS(<ZRkmW>+&C-_N#3qu}R&~Li_xV`p^u3Js`b*w4^Gm0kRF=0)0
zr8W_(1qP<e!Y+rentlDOZk(r_7hCa%J9=+aGG}))oUMQhas{w1uD~2fF&F)vV(w{?
zdSS;D4AfOhO+R+6L(&Lp81kMzu27~mtOsZO*^S#hlV92WV{BI_>A=Uu?W~H1<93~D
z)FW<tZK^UZ2lwZDA#{XmCf#4(8f_268KkZuaQ8h~4o005U$A~5q8e6#^CL0n8|m23
zM6o3RFi=oBQKzO0-<y&1pkH{j0gmJ|N-s*>X^q1@()%-eEFt$?;^~c{`dwXr_PI5V
z7J*b>Wg@G}DWeGPzhP@3?sggjv0%@v=-m1fJ@E$i%MNxP5wrUXr<<>HH&ikPHUO#W
z{b>sB;E`i;Bw8OhbPJ(129i)IRcarcn;$&Io0Xhv)*)Al;8xT`$d4#w@(kH#IfLxD
zIrVkqC2D+m-A0W~czrFDkrDaC>0Wr^0Pdb_PhK>4*54nmJVFC6#N80iN<52qv<&ok
zlaotN00?f}99^@f%v&#mZRKQb7gaSKy+ieIGIJvfAD|W&dO-QiOuuSI6Z6Q;zO|-X
ze+@F`S%Ue?#HZmF{1WK8Nz&l#JOL#Z^`*?Ahxv9bAlcq(0}f5zljs~5p97zq>-tqT
zN>Xl-Ah`>$faCX(B9Bo<jMPfI#e(}#+}$m|SD_ak&?Jf)y7o`ZKGeaD-q_=P2F~TA
z9UtlTw%&6v4C3_aV#W2%eKkvVy$uST&XTeOycBilQgrR4J-VWJ-BIHN^RxS=zi27d
zUyrX}1)7W<tnF>;-r#T66l~huq#DH75Q&%(x`cTP8Sgy4Os~ZPYE#KTa3c!P1^-eF
z3qGA=>g$o2dslnH48RTq<TdCB@N@rh>jQn!;&Ckp;-q9$fHEU-rzzR)11J{?4!<e?
znAm=^W2QB-or9u!C-B{5$OEaAV0l`D`+(kYe_VgN^lMRGV)C3X7(!|5B6Hb;?PW6l
z`V6aU%W9$D%S2q>1ipNocwoT~H)YinK2Tnd(hkxcMGOEMOLSpvz!q$b*V;G4vA(Wv
zY@a8BLwgA3#fk;ZY#RfTroNi{&z@lvDc$UvrZ|k-Sl9iXUTI$34`Jhrw*<;;xi?9X
zL=uUtM}GUB@c4@%$WFfK!wxOsb?p5}&#66%bh}?$;oyn^zp+YmD_kKVvqgX`m}IG%
zD{4pEl*ip$OU>tKh8Pk$lCP&8GSdq3v{R9uCsiQhEpYr4X~Ie|^+ce>=4a^X3R)-m
zg=0Pe<q_bQ`TYSC_6<>F=9hVSE}=Yh2S08Kc3Sn6{nogv(e4o1r$1OVJYS;+J9}ay
zXdqzV1GGyLX=YKCP(C}@RBFza^*ipq!Pt8;g1O`up6E6HHzgiA@oCsseqK^Z2hD9u
zQ7axhU8@9lr0LTbKa60ETG9&?)H&rhfn|<AS$CR*4d9ng9Byh1F=Y{KY&wpL;63Lb
z;{fa;o;3lN&)Q8V0ACek&ePTk&uFQ1AQwJjW1YXY0Bn9EUCFEXUcf#2A)O!5`iCsd
z1{m_Ve!Z|fVDUgFXlq){-#?SXi-(|N#ueda^%DmG?eD~Y{@8)|*_XM(*rj;OGy+%0
z05+AfS*u|0elV8uz|q9B{lc+2$NH}%^9aAao&b5TBVH7|*9kg<UejHVBLDb+pP<=&
zZL&S-x7UoWoeX<+C`gv3SSONnf$+jFKMSKUN<ZO=AZN4uvIpL)dO&-TuXLgI{ac9f
zDkBeP^3fVz?v<Y&L4+FplCYCHKo2uW&N&@SqcY0w%3UG>2A#i?2B;@DUIqR8r&GRv
zgPi99uPPWLjc&+nW8>Sfuh0GSV9L6a^I2Jj*?U*fMGQ@sK9$)~OG*UT?e8L6$i8-p
zR04u8M^<m0E2o$L1x`TVk+SwSO4{4!HMMHm9H761TZX!2x(7GRv|&@VXEDSz-?fR;
zEJbS2s2#{LB_o&@Gap-8t^z+vP!A1+yWi^TwPE{)I|cBUkWG+hE6ZE~{-y6j=;{aH
ztfK*vz=|_2B=7qAqgBB3KSItR&L1;KRd=2q&Fr*IK3J+YXX126_#-&wjgr4@(>)Dj
zl!RPI?sa^DHyCyNOUVBAprw5QV^vR253ndrNJuzD)BxpMM0j}KhEcpbDG7;Tm1^^j
zQTT@&&I^h2{0etn0G2CvQSu9fFj5=^9|hIPswzV{?=InKd1{t9t>@3M_7V5$j^@RO
zgwQ>HjLLrllL-{H;(|61dFLXQOJmeWe@2;9iGXsfo*p_0o~0y{Q3)Q`<wWWf%NBZ*
z$TZC@%u6>~D1!nofs=-THCZQ8nE~1rf&L4}2iwy-z}b)<c&yOVzfsmf$nW7Wz2H<5
zHQpr4(=2`o+DE-)uZSJfD*-L6KsPqf<_S1R*+`8Yir@<pQ&ZPCEGto<Y5M#5@i;7f
z*zTBmQd9p7=rJ?Az2m##ett9_Cw8|mDg4LDc{f_X5@2C*b;4)i1zp{z(Y-mTrL~C9
z>Rgu3)hisaE85dRmh?k-6*ZO6Ui0{ve@skpK>_sl$UQaH5=1WC3IC0jrW%_9h7IxW
zS3q!bgL9gUy{b8(*vNfop@+`Aa2&1sjpJsQyj|M7U!E51QdXN3OH(Vj^wpNyD%tuz
zoky;!S3oqFSe&-eu$_b5IGy;vg010``-crxwVw#f2`T{}gTftJW+r^-KLH`r<)AR?
zh|^S20j-Wfm!d}!jpkg-_T}ekUhg+=uH&HZSb`o|mX?+;U(R(UOLljG<jjVOdmn0s
zs;ulo(y#S>1KV;*&+_6yT6A#G;&gqa*kafd4I}l3n7DXLYb#-JK<}5GM#}u#pXa}a
z(DjKp#VkO-e>WeFi2Fcn#PKO9DLx}RThQ6oNK7nY0AcXbQnlRP-DbUlI9<fnXu7Eo
zNF25fh^Cv`B*dNhn9Rr(fmxJ}Vc$)~W!Whe%v-40fatXW2PF5x2@U#mBpsalv#R;@
z8;D(JT&D9vm&aF99+dFnYOhq<oP0!#61fIi9tvld=}sOds!>vk5!!rWi%lD^32(JX
zUqFWOd>QqMa%Lh~o$z3E$LT3didWYwY#06#<Z?BfsDO0Fbgtr-FEwmoQSNq}YQf8p
zSDnn$&A~t@OJ@=q7RK&zas<2@Kod5(s2IG#h;tE&_mnZnJ31o9Amrp_dW``qE9R`m
zd}e{gdU>WR*r#*RR^Y!juJcblq@gW^h^OLFwg9yPx8=xp%aRTMm{}-}Pph9FqkZ|$
z<Ye^)3F-Fc&H0qDQeY&kxZTe^HgNYRhn7}3*wrc0zoyy(jzLzqF1-PmhqD9K{}Z=}
zzLpZL^)2gH^B3+~fSl^}?LZ2I;`Brn1pe`z_i8ZWz5ve>Y9onsPD&<98JM_`*p2q-
z)9)Jb@1sL@{>ABcys~pZL=%7OK2H!$Mtst3+kxp&jEc>P?b$5F0D9$R_imO{pp;KN
zpj=Gsl+5%+rt_wXgYL@s(Y?;R>G(`}$9w@vTTa`69I=4l<?w`AM%+-&k#<>gJaY0$
zrE5}pdbEcv%bhHso1Fj~NKEFo80cvyh4+0&C@3oW3kHQB@vp=~-Q3(BJ@TA2Zq$VO
z6&&Gdic21Drz@sKIDAI{1JRGqi@sPjI804bGv6L~AFcl3Q@M}k7b%s{d!USUAG{=W
z6>_&^f<Fgds(u|uzN0P^{FvMiY61ivIWhkAbAOQD0N7LYcKsEfPPsq75S%WDSJ$XK
zk>j0GFKtVFtC9bAd>WlUBZg|R7owthOF5$UwGWu@?C@2Zw=?(F_^Eu4q+jg9g1Nfb
z%?kq#yEekm?|cN6D9)R67DI*p;SPM7CW~#Sj^u$&3kwSY0a$tRc^{ZHx>ZcFM8fq)
z5q_eNK`S1aSiasikVqZ%JC;@(%6S`5eVd8@Rvwpvb_yl%E$f(XbM*?nU61$<vXkEV
zh_&^3F7uQPWiv7w(deds><5zPW+E15o4Dl*z3|{26@Noq{7I9Hk&zy29d)~2=P^8h
zt-07h)G|asNxM@M{U37{i}!3EkUhhT_h&^vQ(^FAu(1f6ybnpV1u`2@S9QnMH{n?J
zTQK_yXq7k=)_5%Kn>Dx$<$JfQTJ7fS;^JaslV|wM=zjYKcqDl_w&be2B}&6pe7RKC
zkND9=lBIXet*ucM3o0eIEWogcc~?5h-HxX)jK*J$=aKEK)l$)1rdLA;(ChN=#4gBQ
zWeRus9|!%%gvgPh?^!;CgHr>4z2V`=p7rl|w))$$O7-jqkn(9h=9z%@8Co5zOe9uO
zS6WzU@9;&EM0p6fDX0xL!G0|*FXT<vNP3KJ97HcNJ;T8*$U}}<_Q^X#7yZeI5m30L
zV<?eYX{tEP5xEDEsCu5qrKDESk64qvX&LhF86AU3c6N60Nn*@L?Wi}>K&evjk^xi|
z+~IVPdeU%BqFFymJ8JZBt2@}o7yoMo*v=Z2Ja=v;bbumyZu^B=aAERo@dM4+;vzP^
zsBs;l6bJxjZL8Dgs+BEQxkW!4241A#4Phy8h)ihZ_s|2kut@rcb6?ZS!kc{$Ax!#n
zuBFTl?;1!kArP-Ix5amRI+?{9kMTRctUTVn!pn7QId7J-V1qIbNS`or`w=&o1&<0+
z;_{YBgQ{RgOYCf<zt{-0F|lw;6;8an5BXeH_T1OInR7f8OYK$`vg0=<p>NcGa(~z_
z1^$V^sW<bUap%ogX)*!=nlVj=_M9JMZE!fzP)FgEl|!GJxw*OTUEhsEVB7~JIgbcK
zL6O@aOHU6l?i&m*8HZJT{_Hi6p6{g(^93>ig}YHv>V27m<_~(YEFOvDTaF%3SQG<?
zHaMJ#78o7;7K{y7?A8DvNHEkW>$rmlQ&#HBUt6pD5EpeAyOU!Uw8sjtUNp$*a)=e;
zT)hCm0DXt#0U5dpj3BE-yIvFBN!u=HSEl_o`wOo#iKqFAxb6ut+ce#XyA_48y=FQU
zs0@O*DzwL{iMpr0E3`H6jSm^sGAxM;G{eFVlpjwO)RqI&1r~r`fDfWd3rT2nbcI<z
zYq2RVQ1;>DdjOGRW6C4-X04j~*tf$+LHe$l<psiK<0%+a9+oHXnr~UPxr{~}tbzI!
zan)R%ODz<Wfba`_0-qWn-{7}D;;kSgCN4A|d|>k-(9rA~OE}$r`nJpQ7x6IAnFt>*
z6z3h@{O8EbX5cLs_B208AC{6~)$&}eIhzjvV;vpsW$vM_2iBZGX8$b4cpnIEv^Hj*
zfT+R={m)q$DdUJ`v0^@rj`Vni7!YO1_vxpRd@Lb$<6C_g6ZYr&0+%2^T1b7pMMKm4
z!OxoFzWyDGO?IrK$*gq^UOf-`{c{a?D3oRLu&lv12MVrc_1yg9C&u`0I%dkn0@a{Z
z<Dh&oP<n(5AFdPtmpXNdl72+xX5CIdw`Dqy>zD;>zM<|LLOQnEdi`+0HObunt4$_b
z`=Z@^Qp$7uRcfa-HfNegT!w>h#tDEcjm3Tm<5;}mEGeZr(U4%!jI}-ds}ByjURBU_
z<vmD(B?tth{|e4N8@72}$X)?rEQV7RN0{m@Hd&lq^M*YOC`DOYq%n~QO-rdK7MRk{
z;*-o~E)*qk#u#hzH9G!i7f0w9n115H#cOC3F%ykg;npLwT;n(QS5a2(&(RjHo-=^=
zeE%+1ZEQd?P(dtIANe&C@6aUbM{ho7@iWfowiQR0x1)OZt^-p7V2J}fup2p@5#}K`
z&Oxrt(2$>>--{IK!43Z#-)p9=Rit)zjTkz_UF>>m%09D+r+6Ihz5ckqJG5Q4_C=SB
zi%NbKnaKi`)Tt<O#?Bj6(CQLnpA=s-f&afLL(?)QWENJ&_<C$d@6bS}{ANAS!;jS1
z1Cx`3?YFU$z1)5dEaEffs@<pBb8=(=n61otE)WvWKtZ3cmB9jRY-|%z%sV6ngPGln
zpv46~gpZhz@G-9*5fKsSo@-(x0lX2lw5WbPB%r41(KTD)`=uo=A@TVaJ<#a!TTh^j
ztLa7AL=<P_k~aK>K8r64eIG<*!p92rWhN2YYCQWPzB2Lrr!MXBw#75BwZ8I(=87?m
z>5t=yv8u)D^fuZ~UAfp-_G4d>=eW2qKP2z5^aupXq1G$@5tlB|e4P8SmfymdBp*BU
z1b7|cF}N}OhJS9YcOM(t@)@W%Eb71h3-pPTKP*jH%;X#jcxqx;0|ZwyGX^$CK|Hd4
zS1m;e(?u9bEk;U1M#pC7Lkq@3bU1o(Zt&MN74igaJ1CpGIRx$xPoYvmU-}<@3-i$g
z2ADR}O@n(mJxFBl3hpWlvBFH73qeRN!rIA^Dn+#QJ5ll;1c|D|@|!~Y#6h=FVP^<@
z?F}&64QpvQp;9#oJVN%y7?@URshOUgC84I)mX+1vRimV%1Iv=?UFx4#d?RG_p*F%T
zS8yA8TFv(>Ir7CE?;6-li4OpoFo3;dl!j(^TqK9xC+~b#!(btC30>jOOdMXadPKe0
z$ZPo{ynjF(2xOnR(5g5fC<t^)yv9T#+47AuFuadXfw%8C$cg?540ZyP$%^OCuVNRQ
z^KGfI)gvR)%9x^qac!rs%Q08+_F{*d1ZrVJl*mvgm3VhA7KrS?V0daYAmA^2erO(o
zwAsUlSA%a|06hQ52Y;A_rh+pi3|cn_jL;Yhe^{Hw;Rj~BI|ep2-MVHw2eEk5?h$O-
zadB~48XD$NKL-hU4`VCzrUyF~gJtkm`u`7UUmX`^x3w)IA;=&GJ%|DdQi760h_q4y
z(kb17z=*)0fPx@HNlJ=@(%lT*jdYAi!@$rT-$tKv-p9vt-tYVUzQ2vR?|ZMk_gdF=
zt!s7Zs27{ojf%a-;oYZjJ-FdwY&Q!%4JRck@Q=N{CBrEwI2EF~nHu+XK^0nD6khs+
zK?AZ-b#OyH^P!WZ{(Zn+ll{o=C96&1`E4=H$-yf6`R2;mv5kemKt7xycRox#)AHcT
zC6xG=2$q4yhv503bVT*+AwA}w89RW0vB>Qnc)X-0q*=w@_BINW6k&haNIkXEc#3Rc
zdz%Ng#WWDC+hi)c4<f4dSv#<Io6XBJA|n%*LS8W9oa=OFV`1ss*Uj31>FEi!gw@xV
zjEy-Cdbm&nI$VDBzdhun+9rvKNLFU%c)l4a>jmn2GO2x%5)!^}^>-Vr`S6YPb?@!F
znY-Gth2%6eqlvAFha%v0F5i?3@}369B+kS<JZ`(soM)Dc(I6bYm%c8m2gdn#AaiJe
zmza_O;K=VTaQHLyXh-9pC_79i`owiLYgm`gJj@n-tghZgl$$%XEBOx1MR|K^F-uBv
zyoNQ0m4Z_*e5@h*lVApH`!2VG1SmUa3@R!dfexDdiuLLy{0qR7zMXsU;@;l{2tV2P
zY0T-_<cUWtGeB63ydA-)Mc&jIgif658V&=Hib-4eyCiEi?aQ`bXj{;1m3pEUy^LUI
zd3iD7Ce@YBGcDNzD8o3|T8s?nS=}dP#ZCBoPQUeVzCJtKVT_)X#D%?$_`9;*U=DI~
zup*L{sxhE_eMWO(;iO|xT~+5hyoa5JF9i%xf1^t=a=dX#No^Ye8aP`Fn!#W`w*BqV
zyO^1y9kh)m0Y7kw2zz~z&gO6T6@V}=Isn&k&<}mJf9oXhzg&BOVd-2XR;0dN_YUFn
zzP-q1K~~ltB;SJjrHhLrp|{{~z0dg_DVWZN17AD}3e=-SK}I-$M;<94z<1}e9n1Le
zAvHfQ{BdDI!WlJF66Hb&!&|l;vmQ#JqyDd;@Yu3Pv(hko&;2WlASEcl{aY5X3VWsG
zjm{-2hp_Ab`NUt(qKRlSNPF-5;z25nu2$Kbtm$7e-*+_YuqRJE+@}IEFUe2MB>VeP
zIxK$Un8+ByK2tF=k*2h&@Hj!B4#vmQ(qI5b=n_5A(j3n9^HaRoFm(egGyzy>!oMBm
zfd^1}KG3m`zXyf?P?-B9V%ia12=F6N3|{7M18V1j0{soI^!ndQN@iCDTieH$i`34+
zyoR#1ZW+7K^Lu*)sR~CVE=-CZ<%>YT97gXgZS@;kigQWzJACuyQHlK&ukBc5Oblir
zSEKF-C`vC2^_CFK?L!~Ife7Ei0>yEA|5FAgvVKI#J=S(}b3rto4}1RMCrl1HxANw*
z3%k%KHz1RSiWG*4N5aB4zH9w<V4OLB8x;OEM+f<1?`T9Qcs;*dHQR$G_0Ur9Govk-
z#vW#f(`39jhI%C`r@Hd;^S}D4Inj{Y1FV&ARUK`==O0x7z$gX-l}5TTi^LP3Mpj|1
z_zNIs9_Vr%>#M`b`n&EV^$?;2RHu+vhp>kWGQebkQ=vUu?!hglnFY=KOB^~Spz6MP
z<0)bC=@^gp75DuG>AE_CK3%Z=IP_w!T<iuja876h9St1TRiOK^;HcT$#QRY%D1_|-
z?`_6OdhM8lGn=p}xf}>Z3;!x%DMrL!v<`4Q6+M0X2fx34YJwW@3#D|%4E1Qz#kaS&
zD~cW!BqUVlHbPfnF6A4=)@YM^{od!c^mmucZi=qw1n`ZdKhU5+=@(aFWCWJX>vOJi
zGM!k~ci*AdVwyErnA|)W*)^{1(tlz4)?e_FhYznaBf$SCnhqgQM-wXf(5O<Aj_8h?
zD(tLHO`;`EOI8KLw^>-&IkSBaff89kMnhH2*%~`>0#1J^a|-d{Vc4$juNE+sm!>Ol
z%BiL<GIT>rm+0;RdM>AU-UO6@tm-LBsOqcW0;8m>&Hqc|;)AR6_s~7n_I_K01E+W}
z5(aiCE~Ods#x`Sbc?-tMeNqB$BeJ^M8*fQjNAzp6FbKP(0KX=Vt55IsONe|h;2vz0
ze{aI6sh`n@eMn?On_7pM+brGaQAx?KO*{;=?vaH!N}vtbV?Ixc<9}RQ1nJUSQkwmS
ziDlmHH`3HQ-@!B?@Nz54*%|}jxW&ai&_eaRY<qch9@F9gtU_+%6IfUf{I<)hSxRb7
z{kzM`%0M3yP}=u|fwq!gtx9^YD{1KH6imK0F6P9820sAu8mHgOj~Z97%hkSsFAg9{
zAcF_kK*>$9Md@cr=&1d6e}=atE+Eg%UGFh!!urSaPUW^}=j#>E=(!$s{+MBn;Bd3Y
z%)`E*KOHd@1@;-DHmbi-=|kDkM-z>9$apsalej+E(|D`zL+4wbhs?vS*e=AO+z-Xt
zcU^53hy(Af)Lcanq=9LLPIclhK<Epx@O~X##)<m<gyqMJ*aw@7-*p=UY|%kB4w}z^
zLwTlladd|%C~j$K-R8R!K{u9HP4$F481eXXuz^mQqs6auHl4Eo>Y2pK%uIjzG^yQB
zdz|E5#7+aS)!~dB04CxRK$JAiNA-e3z<4#fCkhIkBZXF)#I_r-OE2FBAU{%;?|2p4
z05(BiN=3fi=v~J5#ogF9Z6^$7k5(27B37!Nf1B#+D4uZ!y9W&J#DE)o+<0XKwGxV(
zkx+HVkp6Ef3<zKlh&Z-jzaUMR!?cX5sw()!!@Z4So6+zOA0$9-i!ssDo0zC{O_hs!
z089Ok$mS1wjY5Eqx`XBl;PmnX3{X$F*wnhhu04-Lw~WS~w0lOJCL@nx{J#f1=ZdyG
z{%|k{FSgCj;yS@@?=XiqNxJtmUDq{nEeWbS$RFC-PFRWSYO6Zd-Cx=X05fbf9dJS4
z2;e!(xCZrU3Juu9P+H}L1j*@F{~2(d*r1F_=eJLvH#ZY5Px69J2hhHRxs@$m3}A)T
zisqV{8ZhGQNI0*Tx+NO)wd6MC+CCGSfdk68tntvsUQ8xPKCl#$c5&ovBN5n6$2rq{
z?*bKb*Q8tzo<fh@8lPv}zt)rt26_D~%&T_&0+0YPiv24!7Y4ffPWB#RerDbN->(9N
z8YhI$6*(!X#tN4WUYn8nRxnlqOgoz3p$-63Cazw+`t+mG0=&dGw^z$24|Gm8nSVWv
z`46d$PIG4VNIF>D+U)4)NnV#zX^auwqYQr6IJ*#sShRrq$|00%4J4x2%&v-)xLkdN
z`zC2Ns->!hbV6$|xzZDDe`^eTuKh)SpAk-Lg7bluDo8z9=h7`#TupGpK>n%rj|26)
zB}#k-#B}NNIpt|c@W+p^=d*H?O;pZ+(O+))fk>?zxH1VZ8OxQF^<9{anh|wfq@q~H
z%fFCzU1#zqat0Re>0`;prto`GtFRRyx;$Mx^1tMin%4z5Ue2`hE0PlMU;A?b|3W#r
zD|4ScS^=w;!_B0sSNPRNlX(sH;bqhh$BcrT9;oO0V|!67oC5Ula|f|)-(MGRc*8y$
z03(7SAFaqDWw+$b=EtF>15OC(!{uSb1*$_Ntmc@_fO9B58Sj@gsmRd+EK}w`mx*LA
zjAg^##%4KA*j2-@J3&k^mgB~a2NmTHWAvjcKSo3}O@JXyGyg)Y^4|Unv8q`Qa)R&2
z_RV!lD@NIJ4|cw=UKbXH%2U+gpVUkB7)1~zS~r1%=PfB!yPj#)7UaN%`6P)<ye?)U
zg2{F$zR+^mP8uz+KD4HiZ8xTBzg@wu-31brcdmyDzjLCXx18*mQd!w=<qd9(XL^^k
z{@<aY3<&g@p02Ju!Urj8rVe&)IJ<V^j-}I3^s`4ZGRLN1Xozg+%_zB>bhM0(V4Rv6
zjC`tWqi4{>uGMV`8d|Dm-j!XkAgT^`@hquxmyHDBm@JLdyM0o76$%AdU@t0rZ_!+!
z()il4=!sD&yUUx?$Neu9tim-9(1$nM;SsEZL8J6R%a4B**Xf)5>Q*7YSevB$&p1>@
z)d7a^@nhYS6Q>_XnrgPWxjc!w=QZh&6YgctManAqXLS_fVO@w3Fw?H1YHT~v*DKk_
zhEEMNJ-gcfR_0hAP=MhYMj+E2z&Zq@TUz#B9J_sKJx@fGmUaA%9UV{p8r?$(lmlhz
zygBh_^VrhK{u!E0*dbpq!`J182Z~{%YV0I&Rc4Cr|FbgLdd;pM7Z!Jn25CRjEzy~U
zY_&olG$5e;koEK*UjvxModJ2?Updz%fXiLBG3vy`qNx#}kUKt{le={966&tQbSQ)L
z+pAZ`gkD8`zL2i}*#0$Ive<eovdDRv+*#K=u$6_?RaD6BnORb6mnh&%?JixO$`NQw
z20!Zql#Ws1YgMn<wJiU}w*9;&^44L%J!Y)Lo`%mBj8)v;)X2a=C4E2w*nJ1^t>RLk
zb#n9bXMT`(LXUKuoR(WcVd4+YHXv@@ecslayyWkXPF8~2*9Yi%JRK|1WMNS>Ght7%
z6cfYAXs7;c36;7)meKS$X{6WmabVoBUjKzdWXk<;lfr-AzM6Zd9{vCNVK$KZsY*!*
zCH4hyu#Z1bXTA&hUn+(m_DdNngbXrg*7mEN!29|EL9v4$B)9foDhhBS{VrFo1}%8}
zXZD~8^rwprxVia<fOY{Dd|`$-;ubS=weeRbcZRIX*G>$?YTk~Gefsu|2Xn)wHpF{R
zV+Up&kp@-tNLmEFg({#|G(amy%^dydNs%+z!NLL=&fMQbc;K(Zbz>#pw|hu~<Qlvn
z+u!SwL99j@O)eVHT~3dCC`?S67zf@iE;%LvBeuN?7sc+C+vSG}%E|yPBg?*^18i^y
zs>#`({PyXj<G>5!V7eUCeu5N#=UaBy-m=7bfb#3xLL3PMl3*DQx3}JbeJBLd0$X5t
zYBe1`PhmIGK?EdOqos}t#iO`+zo~DKLSXZp{dXz%ZZAxB{_|%cVJkJzP#B@CGAJQ&
zA<<DNd@pRc5@hG*prkEh6*;Z}Dvh&e_c5b89-!%1P66>%92FW#84*`?-@%)Fi5ccf
z!2MHkkXch~&Q+yV&dgST0>bZ@zhjrXP#4fyfJeIuTgu~ecV6wDAe|^G=5&p1^J^ES
z6S<vfZG=MZ#a4{(X%}I+qb(nJ?koDeI|ksAW=&(&DXvAk0lUI&^q=#3ziq`}G#7Qp
z`Sbp*t;pr^1AUd0dcx(Wr_Y9BWwp`-ukx^qZNh;EaY(zS8}t3U^f}s(Gg6?tu=!V;
zYevq$llecNo8j^W{5s$QAI_%PJ6T>@TKbXCj+Z(BG->-&aK>?)_~{0WF2Nz;bjQOg
zT|-X9O?q?OTTvAL+Y!Fm=kV}#PLE=DE^>~Oz!>(@U)jGAr{-WAd1Z-13u<JDBJQ5*
zTO&l1AP}HD>VK*U*7(8&jMb5pwibRCknVV`vTFfdmSWYGAHanF)ARki*eLqhySfPy
z-e3mlJV6YwadKT&76M4riq6<9oOaa6#7-ZSTk)E!4xozQQ$M271-XgaeH6kM-NTl(
z3cEiJBJiShkMgzOgi_Oh7hwWZX#ppuk3u|t%%|AO3}(%~e}Ccp`AGnhPfkv%rdR@$
zoC<(QCRSjuj5Tlm=mNX5e~p2RoE#e~D=RR%9ApGfsxbp5+*kr|)jq?));4Q7zF7~o
z(Md3>k1qN01LIP>7kin27zK0MuWi64VFRdP`wA4Nn5)o1DYg!NvQs@bTPCP>JoBvt
zT+7zjQ;g)V#FMv%oIUJs`t#H63MSrgQ3vFkb#bt<ff2cBX=%QJfoRYPIBY$z;0Q8J
z9|)A_{CO_WSFg*>_RC>F{%5bW(bk(HaikLvnizWuS_kM}<FWkE9Uj8K0)u_L3gi{e
z->Y$M-v#a#GBb{EBO)lQfZ6rIoCEy{vd)la939Vu%(j23pV}3RVz_x+m1}gttP5|4
z^gX3W2FQAx-RI)-o;ISbjDBu(AlL#EqY7H8>YCz5b9Ad&v@CIclRAODOLiA=xC=xz
z4AMvmNy(Y%>GpV0k6)eesy9>X^k!#fdLUmw@#aJK64=$7@2rdm2M4pNWi)qo&U&Wg
z516x(r}J73WY|^h-4qbeq(w16uH<=>7|gau0mLuXWO3mlz$cxCKRRPr4v!uY-x6J4
z1$>Qu*Pcpe_!!7jBX(a5(sW}M{cl}u)jGm*3f=X(`aUH(vgwlMWO>K~no}se^*iF0
zL1TIx9kCLA;b?A9_x_nPD~qxcVygN{;KEP2XxAgSH^2(w8K4B+?7v-~&6h!kQ46)L
z88ffX`}f^+wx{!}BO{v;>w=E{Lq)(1;G^8l?VaX_R?~1W@T#7!>ESUH)VuEiQk|70
z;<EnvDuMC4m?Pxo-DpdEWbB<^wI|$Ge;Up>`~jp)`yl5%C0{tiUw~$#^E<FJr#bIH
z2{w!bq}m#We;NLa)2%Z8GyEa{u`pmNR(|~WdXWG?rLCx0Go8{LfJ27$nG6L@&8SM?
zcP_aHwY+ZoG=p9G^JxFzuadav7hu2UzX}7*xCH#*5*^UU&16oUzsSniAe9-vQv&vR
z8Q$OUu%G8=-F1e)W&C0^N#57YhH^Jy1-JYUVj<ek0mV>SH}wwlpc7%HyPcxXbQBVA
zAStc0#;)4Ayp%Px>pNLjKkMt>-pe3uBY=bgGz?TkKHyfM{#9pqy;A9Y@^?n%*Uz?e
zb;S!hS^6vlxD3#!rFgfds}-^nGa-DGIaOkiVFQ>gm=G+`$U!=&7@c@GGeuNHAV**H
zx$by0i5&qRzMHm=!9mna(M}<@4>wShPt^7)rdD6{a(A}fKzxzyNt=kQMBo-U<R@8%
zbjqJiJ}^5u9(Lm0GFKbH+kxFmA$>Ofopd%r{eya^<|S-+Hc?49Or%T>XkSUVEo-z+
z`%Y3nv;JPAtOKa;f+SjLb_$G(JN5jlO%U^P6ohIE`~45@J){k+Zx1s&wfqADmX&6d
z5b;3i0lWi!Nj3Q>FJ!ofwtR?;e1Wax+qp;r75v80!(aybeTLTF^Pwk>BiuK+!Ha-m
z-dgC!`0Gl80}<iXlVh7F4Z|B5v{xu|jBALCnxSCGL!X8KFY@UOMpzXI=y|q>8g?#R
z7HbS#DvcVbrHxv<R;}kcYaOc8Unlg{9WyrN3}nzXlZh;>@q`x^ru>j+Kmup->|vSZ
zZ-Pn&#qaQX%78{ZV)CF2GF#{q4%veC#crHAz%xw4=XakwGl3&uIM2i_Tg@l)e6cEW
z+Adr#JN}+UNQ?208nMHDfOj%ZIo9pXL#Cb`pRy98X-;M6h#plHZFRn<=8=`B(6~8i
zvY{ve3H24aiS6Q#N3fIaQM(4^kSE`ps&hT6fmTg56<h}y)|EVo;}-b9(Png5+mX2A
zY^=gg{K;PYkLcRWz2=)p*b#F0#f--A@O3iWTB(UfJrUlWnv&0JKePNV_f&<`wVzmh
z^|*7qvXRQ6n^q#E#b^D<BDxQDmUb7)9;*FGzruo#Rqm%Dn^WaG%4YK4(tfbosp963
zM@)Fuj@ej?hlZlNt27Fi4A^KMd!rvq=u><28k~f9?6v8usj3F`F;d8Cll|f^-)MgX
z?nbAmuUw*~wP%KAexot>`5SG+npXExflbD+4l@}0(LZ$RR`FShDn!>WgL;M&9*Dk<
z{1Ef<II+<GZd~4&gVO_##HuWtgT|7f;KNV#Sg@E`1G?o5mW0{WSvpl345j`&l1>_M
z%$#p?yaxVkPBG$OSDQxE^_^u3Q>DrW_3g>vt3MiFKL>k}o1aOS=eK${_|__HyoMW<
z2QC47HGM5aZ$hT$zi+O@I|TsTT3vm1+it96Z6P%t`SUd70M9+hvK?S&a3wVu04Yb$
zO)}rSDa6x|kK&g6dG!C#7_Kyd-r9%euPvtM^SSHol2p=O;-cT^+GxoK+(#&bJ_(J>
z=v&W*O>V2btP=%v)1rP_Njs2X_VRX4<Q4hqn)RRcJVbM9i|`A5YppIaCePB}Un_6O
z_7x_3#&LSOrG8HTom4noDp)mE17w40j5JqSP7nH@2e|1Bkc<M^nVyd=u*AN}A|Cx#
z;(ge_kf{1YWsa5cvYKEo^mT^Ns^vWpH>OklbDz`))%3?*)(fkHM4qA;9^YYvgU3SL
z-kZF<Pn_SF=_Y9MBQHZ!m7TshK1|a~HM7q5I+r5SEeyUG?r}v691v6AKR7!5BfGE0
zIudAQ_o`h{3~ZpDhTq@m`6&VW@#6=uasuwiGcz;qo=dJtJRX}RBu7YJd`AfmeP%1;
z75HtECyEkm6mwO3{B!sB4>K8Uo3V$5ivgK-hk+7(^A#aJ5*LP?7I{udkzXawlv1Fs
zAbqzyNJBgSeeo|4{oa}SSJJLvIFO;KU2MCx$t;9VMU^NV_(Df#3?%Li+R4f)D*5^O
zZ@^^`ybC~2N^0uc+c0QR3aIzg5wW}iPW_=jwRh+C2BQ~c(Y{MUlpZ@^StU!8t&~NL
zDJ(9BGq$HbLe<$Se%#{-VJ!#lApr~uH4GbbOa^%W+-cIpX|5jR$NS|zGi?#6si}{8
zX)ZIufd)d~b-56Lt-s`zk~ypY!huqBP(};3^@zDai%=A^pmPQ$15hYOJ!OWX@d)(g
zji(k?VY&#PwpbE8R81$FIxY(L1rgmgZEc*CkTC6c!hc@GNOI5`I_|@LBfmD!r0u3$
z<g>}zHyt1OD6U-T-qSR;nJ;iqlaA3BU}7?vX$?ao4wiw!&1G?>Q|q!|FwakH(E>YB
z=|w<VBoxp&g=n&_*f*cI+f!hjRC@{?F=*JpcH7V5)VQZb&*;*Q9zkr{Jjd*503nw8
zT~pvLhXBV2pj(iC1dcgQ)1d+&!pHGEGWlIa7}HLC4aM~y9{2ku8tP8WM4}DdZHBAb
z%)c7C%m1ym8ARIv_#ODB#39*9iHLx43XkZySY-y{;t*t?{ld?~e4Cs#Lmw6Qz*W#n
z-dtS(SV5sA29z+QtgGMYx#)Iawd{!-j;;0HnmhVs23n!}lR>^X1=_9LDDLu|o>VLi
zE!{)7Fr^2)_lNZUxy@sg`wXrRzG&FtGatk)j>pCH?$Nu1>}A6A%^oXx=MceK&KoF_
zWzdJl4PVyZX~9v8bi>AC*`p^HCeKL!aSa)J)))-s8cCbYJ<D}kPmM=d0!bK=X2?!G
zIjHxDBTl42_LxxpYHg~KTAl|${bn;K<Hr+2J#auy;Go3aB`*;BQ;NK=WE;`Xv+nU{
zjX1Q5;H$;HAga`}qmnQ>kB+}eMP<SJQTB|$Saf$>%`Ha8`ghmrxlrN+x|&jb)-7yf
z6)6b=s$H$5U>4QLfXfx7(C|%#$9hzIr)S$I?-M~a=WWzBhhG%8lsWt!EN^F{Jlv#%
z8hjR@4-M^{d#^}=mQ-pBnVfWU1kS*?K~FXD{y2iUUwhIE0zSuX!BMa%?`XY^Q~u`V
zz_1)LKy1|X)&YDq3P>}wd5x86Np#;)l5gwECT_U`-siv7TaR(V=geA2nhudp?TzWW
zE1`KADi&#rH1lTvD*Xt2RoLR)mcl^WFz8*$q50=tY`Rpbr~gZ{M5h7_2`^DmwcrwH
z;B9kIk<-1b2CPl^l1AGAP%}1gcWYzFy=sCYBQ@028r5olMAxs+w0Z3J#Vb>5HIcUi
zJ;d-APMedqDo}l$3dLJz(`#daP#k1;@P;DE`<R2(@e|?I($Xwsn1=vQXLx`C$mcOV
zDQKKUd;|gErz>s1(varVLYb6c=<hxAKeI>%;7tttSjuy9g1Fz-H#OOfm-YXta?j$g
zl=;QLm{1z>0F8O`Ij14&a)tBeU<~lQ&;p=dp#0BXwblj(MihCEOS+fdL(CO%k)GQj
z{ouW*lpsH_=;1L*Wwn40Y?gj2y>jUo0O=;|@Ie>7mzn{@SI#c5Lu5N(LZU=HjzP6-
zJ%*S8KFOyltTQ+);bElzOurdS4|mtVOxFV1<YbmuwA`tpNQ1eBH93TYh-jqJ6>e-i
z?RS+G%-}8ksVlN;_}UT8vjlvq!B92wCaf>+oBAAb=asP*Tg<AA#nR9c{C~Kj1_}%R
z0gz1UDMXna>5U$I;?;a@#gO#n=4`Hn+PnipB-o8*Bs(yiyLwxj8n3w1#t|DicmJIn
z6~DUD=Jz)O7zif8#WDF8%~B$FJu~#LI_i;>6n>Y#Tq=J)N?J{e0m#*#Fm3aztz&()
zftT1*n=Gxu)~vwUSLjkZOeE*FErtHr&cf@a_>Wj36hli&1&3=NwpagTwCS27RV_HG
zfnXAn1AGYS$6j6`1N$cTk9|X{d<T@5z&dlZz~VA1=xgZhRelL33Q9>!tE;N|+8~qB
zlhHsf1N|DqafE!f^qzf$%rxRh?jTmqeYOG6gy?HWqV^$EK-{a7Ke^0N=&y^I1LEGD
z2E|Z*20*v+{-t@s`H#!?x2nmnSZro?wlhXRW!L^K^btzjHLnD{ki+A11Od+V4b;X)
zIY&uA_uQQ0Tj<}TdU;RjEbA$HvtJ3BR!35b$W>5S13_&#oxq5R^w~qmJnzaHj9{aU
zwUw30=L_x|aB>iCGuP6|XuK8<$T7Qgx)BKe8Af&gQ%=eLWKQz<-K2iNnYVy1riMia
z2dZx55S%TgG~Cny!1XW-`2VSNd^6KKTm;I%pTuuIE7o>g&wl%mz%CQ`B>Me4WD1s9
z|5;a6all^#d78e*_d8&BOP*h1=e)^|U^H!|T!MpaPhQgs$tqHN{^(6d`VL6CX(@Z*
zhYWBLE>3c@v{PRHzFgao!)kf{S!B2J%|C8oO24Ya_NKzZLcp;ao*G>)ID&Qu!nKv#
z_6U=0Tc8PaW#!=sL-lJ~0qPj~m9`Zd8XU}JSbJ{Guy^`BNUpEBOp|S|Ck2W;wOQw|
zuGa(JZ>(|XZ0-1brU*T$3&aOqCmJF^8(3b^^Xtz}cI?t`cgC5`fZK$LST9F%g>V&?
zFv>p*=0vhKwRS?@QC{R-=iOe<EhTMa3MfWI_#bu#hp2Rk7Vy$HpR626cc;ZEM3t~v
zg&&|~PcJw^+Qek@KQIU`;J8@hiN|Q=tt}n)=n{LrpkuX*lM~M#RPZ=P7qAgpMb=d4
z^nSx%65fx=gG;yMqk-uj+w&zrjB)B!M8b1lZY=^uVYx^IF2d)uYOE10)s69O5(Lt?
zkNOc`Qi`%{ftX>`4m1po*XIIjx+mmjLMEj%i{8f%+{by;C@Eke`X6><ABy+)ms!Cj
zNC3<h)<$8)2+vJPgDfGBXvy|2w^h>;O=#{9u)TaV7j|fHI8H-E?z->#1T>kv`m0S>
zWAc~6=fn(a{r%I|^+wU?SynIHNX~?tUgT=EvuRP0>EsC_$)(JMrnZrUH~Uk*q$wWg
zU&ZNo=pUh`9U(!$aF>_N+*PGrr_n*D{)(9!ZifLSe*IwH$!jpOG=B$D*M6sN47Rg#
zv^)8Hwo_m(&ebeyCk`940h`htr>*DB(Y{U8jU|Lu1TjMPp@yD_CN`~1US{UBerwDj
z8?j_YhWtQ=cRL3XVh1emwKPe6iguyexue*jO3sP{2Ir~!U^e%?EWcsrlKFELn=0E1
z2QRJ@QX|n(bUBvytKGlVe$E~{5L-utpp@0+ABhEjZjZ!}G#JBLZmK(;DCPGKqEnOA
zVh7Zu2(oL|V4e6!kh7SR<8K?<(P~lkOuY?+&W|K<_MwSh6e0CuC<eK1Yylp^%>+Ct
zfmr*G`zTzgn=XTg-D$x$#iurs1_uXu5z4s(edPp8@Vq`hUe3|{t?$Z|UvV97vD`J4
z8oQj4@Y>oz5*~8%>I2?<hPTEYrgQP-JT6q|T^3h67B{LXEdSL_7~!+|rsrKL2$@S~
zUFgNE&z2{DI&*~Vt|6BFk$@(<IS(H>yAKtahhHR{WJakZXGf*z2*~j;m)|>IglQ2B
zs<=Z?PqG`!X~{cdy&scgp$h$R_@QzqPV-T^$q{7eMpOfpp`$*d7<G>2l8*B%#gAv*
zSo$-%H^%c_2mN-!%gO|NZ48Q4uYlhYd~sctb`w^6?nptq+p-X($N&c|!mqb5tH#pX
zFbWC^-n!M)pLw75UD6@M+U2^1Lv7i@p%Ad#_gmR&w89*+G~<#z_v=q)IS*@*iu7)v
znl%~*$P!U&-b8cuw<p?hjc*Fbz#E%1e<De-l3y#jJ)ylgT$!H!(QD|;A0J-^XA_JU
zkH0r|p=lplRc?6HNh}+RV&&i*jG5D*X%>~^nQw_=m|vU)>Ino(Jyil)jCtC^TPQT>
zoD2Q+K&C8P$Wz=Q_N#dFaBqm7%q4;&<Q=fAcw?>z@=iQ)Oygrjld*Zs^u{g0zK70y
zIS{#=Piye6n><}#23jg#-seTI_=O?N6bD|*k};T9Lx))Qp{XDi5`{iSyU#amFs`-a
z%{7X=K>aU|KSCj5Q9=cIpMAlW=?22>#b#POYXJB>M;|J$(~^o-akIc!o_c4aNIy_t
z-d0lmQBSv?g<j!Yf^}Pg05UyNF#l}%cs(C8#+n!L)QM%-Lb1zMU4BsZiS-5gUvHca
zziM{%+Vyul`QlR#q|3o_lKiop$SXN@*y&jcW9)?0odFltmzA5fF)f$lbMg$dW7-mX
zv7zXe7`o$W|K}N6Rs%@)VBOCr?9%Y#8v<F<KBTTQ4K9p^M^&q`bsaff*y;!BhduX(
zEhI-(h?W8dWnRh+iCaB?YG^F4{7v8gInQ^v+}p;6N$dx!PE@|#+7JtATXchxUL#+@
zxAjePeUEE|QSHe3C>VDXJ6_NTx0=U8zq&_SWBqgev0!!VIR?Iowa5DNq#TLHC`W)n
z(-<$WT92JG!xPVrQlyJHvK6mk7fo+uz8Y74{0bpG|NRZ4Jvdb^Z^F)sWe?8H%oLVE
zB5DkL-@RMg*#X0LimM#7k2)6%idBj#vT5(#cR5M;VfU~({19cP_2mU-8)n9-Ln~}@
zQ<j!x*L<}ar*lZ%7Eu!;`KZ~xBQVM-E6;K#6>k6wIYMebjYsFcZ|NRdB$_!Cda@rK
z7dOsEQJ}P8VPkSl6ff)j1SA0+%>Qn5y=I}HKEy&1Y$=gH<L@HqHum@P)2a*(pcPO~
z>cNZ_+qHgR8Bn)PM@I>o)qEF2V6zVNLqzltmJ&nx`EhYEp~opaCd=EfK>1jw%@>--
z=(~zBUo|dWTUvFedhhgls#Y4G6kR;!K6KWHoR5zP`7PRgGW6}o$qOj2625sjBSe>c
zoGos=Jj5pyM?{T;SW@TDqi(gwvXF#?gpQ6*OiU~%88rCbxzjy5`q7jgyMJ&XPMsGl
zG;Jp%Ba>c8bpb!LMZ&L6`?IfJcHsN>-^}k3kkT1HchD$xFmZ5jP*(0`2o&PB{Kjnl
zil>0<SYZyQ?m?4}5__+uDQtVIw-se<l0JB=a=RN>JkAA?7}j7jGI-2M8Wb&n8=TUq
zsRUY+x=hfTrJQcP&+dZLst9a4+0lcJMn%#{cdPXz(nI&$Cma4`gPwoH0Nb5|FKKBq
zUw|)n8R)T&iJ=8E%#^=?Mn)v9z>nqS%)zCWazKYz7779bYJiJ^Mp?k*F^Yb9J<Z0(
zcJ8uq5pSM*bI|o^JD{!tIkGHfq;RSFHEz&NP^^56b(M7HbKSO9KJOBNVG<RM$A)Nq
zVn-0n-pf>c*+FGwIi&mQLA0Qm*cu;G=PJx&JQrrz5RaP3H^MYaFGwxHeGpCTVkoa6
zqJ3z+<1)Srq*Ozt$%B}p_Y(0Rky3XyKOWGH<X5hI2n#C&=!R~&GZ5y9QO7`BqCS2k
zzWW(;P=RS<sY=Z28ymQ|xawdUOmMImp8TW28dbIsc0s}Y`gfNBWQo~@IwxF|Oz&5S
zyz`n7^t=PwstB|nh2%p)E4c>W_CoWBU|);Z8{UTNMF{<-Hbr59C6|y_y8K|fTeGRD
zLT?`(@7$f$(8S$`7N0wUJY@y<k!P92#}fAVd@=&8FCiw}@9ZY*0RboDJ1>NEWCA+z
zRYsySe&g(`;9wroVqp9o@cuochXoyTNObhWsT-~?E>r<h4x)KVG`q3U(bNM|8osB>
z(B^Uj573dH@YQ)0hHo_fO@NsGCNgxO>#WCTm`5-q-98Q-JRXb3yR^`Pay1?*ZM_|$
z_M$Z)<>e+U<jp>GFXKs;E%6P=B0T%exqaxnyH6L$J1@qeIOBNsdI)sJIqGl!BMrHs
zjAF1tKAo?&)OWM|fY}TN!f`Oqy|J;emXa#$;S((_A`tL(^F2vUf>5jHy9ggk>YO{K
zR(I1K*mj{xj~qFdr{HO0!)p_b&1TlVGuF~_w8zf3_MpS#PH5Rr5n~LKfm~h3T$5jJ
zH3H;`8k{2_oOAzv@#~kN^GH`n{&4};FTrUa>CBr4`Zob75I$#JZ7uNsC18M`A@pE4
zz!JjTP`J<Y($HMmALW4@`wq~6t<qprQO~z)8-~}ihp)qpc?Bu*k|IKmmuNJAMn4SK
z(t121LsKg^zn%U@U|5qc2*#Q{P2bG#hiu;KhvDM~C{erxaFMild8cc8YY_kAi6rLW
z#vO?yv5%J%vr#dEPG5&t8|mQyc9f+)dplcu@EiN|E|jqH!jvNs$c?{tAlG0{PA7bX
zhoECxFt*bLF^75MpOo5uFM83C?y@!Q_tCD0Jw`kG^RHCq%xv0+w)+zq`s(60m(XxD
zg}#|dx{WTy1#thr-(G`(B*M|;$#Z?u=UbCEAZLys55Ugh1FNgK@#<;E+Xf_5?LGh1
ze?*~g50;;l(j7NAaA|1?G^pJTnMb2g50jN|hhZ?cd8@cx`1trDD#C(-?xQWg6rE1D
zzU`w>`|@zDiQS*ZtgFkDzdk0QQ}0^r(<9FrXN*7Xh~G4yKgvR@MQ@TN@FUGutN(_z
zA5DNFwe8zXgqi)W>2YIP{X3&nA?wm_lb<d}qw`|i`CaoMZd)z6Lr+vZ#m~l6a<+f~
zFq9n&&*tdih}y1S>T4=5WB(7=HUyC78U^NL&D6u=$*lsf!)56-zE>0#ksm_XJQRwU
zulNDzwXamm)GBoRi9n1ic}9@QJ?e$;3E{7{tMhc!=+w0bq>ICN_hYw*2iMkB>U@UD
z@h<<m$hfGo!mT<FSX@a7O9Ct0UQR@Uyf*w%p%+>}RcU+u8+)CqSKg!0kvWVkxLOJQ
z7e+|>d)f7mQ4|Eb8e1wjH!l8TAN@+_2dG_ZJG=6f>RU&QJ0AL?_+O5Sp{CM)UPp%L
z!we%Pv)RDWph!V4(V4}9qIE6tzH`dX=SaXL$^|QG8rusb>q%72;fs%)Vw6AMZWecz
zK88Rra|`21$v1<k<5YN{k7m`g!`xHj$;_5L;l7z5>Er*PySp&?+Ff^?=sy%0sqCv$
zti<3QkbmW3n=p{Re~q;9@mHvRBx-^!2&C{#nB0wBz)8nHH&&c{hfFW!|5iuH6*%QB
z{mOCdgiAI@k4$a_>Lr9RKnRW?o+UVc9OZrF-Pl!d?)wrF>_hKxx0KKNNCn)ZqK>~-
zSyffE7V1|RHzBNn3XN4Oa2bxvB#5Y0f(F54qqy$=h~v-9d=JsPOeTtOd3nz`l(j7M
z-j|DcQBW=k8QGzzh&7Um^u>#OH!;^YVV$`D?dJ=$Y@R)PW@>7xqZ5-p`t<z2KL-+Y
zZ0tu{w1!z@J_>6Rgiy@BD#1Z*NOq*e$j3jw7w}a)CO&Ux;;xlNVFnsUn=zlCAwxqq
z`kpJRli)l&hGZXl&K~!VP>?2ig%%ruoaVmZ2XbcKd~P|vl(Mk_hN-cZ4bo)4t}@(s
zg=A5SNMXb<(imPMH;Yt!gdli(7!+-o5Om=PxdA>Fz3qRl2_V=1R*4uLLD1tlh&k{J
z=&KRpA7}?g?jIX|6;te_<znYOcP}6e+|D-k(ct3!0%GMN-<dz^tIzijy1Hs%+&745
zXHIihSrLy<cQcLQ9mGEi0)sHtQgzbiMo!7xUM%fxW)-{Rknso3-v7&gcm{kGE`Qu9
zR<G`NzSpkN2`|n)G<sO8AZlVq*i;voE!V2xM>)BK#Z7MX7QPb3pX*bxRD96357~?+
z+ws;!U8eq`_2DpMn5lr7U$kxbzdiRWaO5n;;2j2VR^BmNufL_m?vEfKBir7C`o65Y
zCVv`D!pS)kFXRV=WG!tJ*yM^H`t4iUXW;`ghPK04cmE~@7mWe7x9i+hiOJ!q(g*to
z_g|PU!U>;C-^@?edeDEJ<DPO@2BN1wb-?DoEC}JbO1=1q2s7tcr$Vht_XFq7M&N3b
zkU-DMTHKEv+gUY|QV}*Y+TRyN@tdiooAhAmZ`6vQjHp5r$qJD2E}+~J7B1K1FqiM}
z&Hrz|;k~-L`sve+e7?Mbf&y?q+w;=FoZDT%NXcIp%I4yX%lx&{2|vBeS?SYVX0{8h
zulz6n+y5lGstI5=9+<!K($%%^!nCf3hlh+fZ-~z!=<>EG<n^jC;Es~9f8l=x$XY?l
zNF6Zyy~fL00a(+z`0JO36*ORlc%24%rc9*$MW{qDI3P1ci%Ed2qh1Xpt-dsWRw!)v
zUe|BlO!<FHP^sG=0a1~yEk}!@CA|P>z^yqKl^I}tjoX-TkLK#tFp%k8DtBHM1kJUd
z&#$emfhh{s=?0t!+smPCGpBUdRG_GB`aHl-tlNi*ZtQhz$!4>^3oOhd+=og?fS9tz
zJ3`(siPY=q&(_A89gcVm7Js$V+ne`H-$wGE`^QxZ#<)`nIDD2$di_NaG<$$C4W|Q4
zRaLKiu2~3*04X&1-H+TR4}r}LP!wDtl0+a71_s|3`_u1obEhR(eBq7slgyKpmX7DQ
zzrn$Qgre#oSI+Kk<ZJjFlYod7yt~vCa%1UH;^`;cf~{+&>x`32(0;=ncRk$>*@9ht
z<S{?AMK=>YnI(9r-OtB*m(_M~Y%}^>(-Xwpk69JC?criMeZs*-QuSDZ&+LDczz{rR
zlES+NPpX@nqkTrFXJ@6Pq(V^lNUzAs%O~3hS5{O2cBJ&0W*W(%0+`RrYR*=VW?^Oq
znl&BpkyR9hB~OJ_>^dbuqU}v*xA$Tr)Q*{0wU4&qphnUnmhLQ@Fkj#MAhY+=IP9Du
z?HaJMTxdh?IAa7$%;KYPvCHGh?B!KzJ<rz0`^NEiXm|ad+;xT`+h!n2@SOdJ6K^=G
z##cPtBgc8ZUCXT0bXxArgvCzBmXmJH?~EC!LlqA0R|$!FIOOK5Q=s=Asu$yxC;#{z
zCt^-80J#_#5CPgjq<U|MeJv=sG64o4CrJkaiCad;<N@@n1=}}MwHb|7n41)mR#r<{
zTiZ(BGztLeFZRBW8Fsxzk`9A9@}mO7X^eohuwJd6TXd;*G&L)}vUPZ9IMx4T(PuQz
zO1x(#+Vhb^SVX$<Iob%T8OUhJ@y4MrA~mPbx?gR)-(%38w`?_|h_rt%tWaNPfu{%_
zuZgQWP7cBf_xCy>r!N7}#eb!?g<3$RV?FHA74YK#8W}1H1S2COD{$*pw){41J!3`D
z!L<V$Xa$2f*yVnIq=om{5Yol_yPR9a*It;Nt2oCge#z0_y*ejBgbdd;;Q8xIZz_rF
zy{cK3_hdMMBwJ_LeR4K=N%-u{gyGS5lkvnwh4oI3n5_ZGtNy$$Ujf=b_CO*SY9$9;
zT`(25f?z;3#7Ke@HkfI(NbYb5@z#55FGy~g-;XsmF%jPSrjnyWF;09AH*dY$dg~|3
zAjm5xVE#%z7=Gq7SaSNYY!Gxinlg;v9`nJ6Jwi~&@xg|6cqsCdhVFjgvXA`RbdVL?
z5~2}eSUEoz$78`jgnFL7KRaA#f-#2<^{!1I#d!~r9co7wGXWa`sBh?-c1~-{D|ZV;
zZzZ&cf$^4#{phSx7OS{v$>WJqC!dF(o?FcDrDu`qZQY68#=9KBKj1Y^<}z&aOmUBp
zbKs(8-=#=uQ$s@KI?~3nIv&1GJtlVVJ@d-OU8OLCB17u~S*?L@Opo>UKO(I^q*qkb
z9f+45;I8AG7vFUi+FI`^B7JAM(3gu3f}=5X>yDCt!RIDi2f=&z3KU<ScQ4iP$3upP
z`1TPFrfHcBcKz2f-@M#yY+X;qg?aXwFjqJ(JL0ws$WD~1%h=_poBD*{#9VGe4D<U;
zJ^LfU@WGQp-=Ch^-`_tSZ^O91gUwd{^5G#9tlTdkTW$ihaWUC3F*0@+Ykg%0G;v1j
z>s^skmDfzJOyR}Bz2gJyElw3><@Zua@@>hZi}2$zBY3T!=xBj8Io=WE#tqB{>}FlR
zLx&4U5OHvJf+G~WXf9KO@qH|Fd$F5guA(v%*T_VxPL3u38n?k+b)BS*L*8=kE(=s*
z63#FFsQFPJ>cZ#IcBGrdZq06A=X1F2xFK<#;47)JN!3nj5=4dxrQeUc$)cOf+Q6}=
zG^IcM3`HMnR_s?hi}{O@#GO*koZWXi>p6OlU8iFwpKV&T8Zc}T-($Bj9J|c&fV;q6
z<^yJ>o=H22y5{_(wXx{aZobW+Nukhl&7kna;i+ugzG?P0(>wM-|6;0m!*EC2;IJKW
z2kxZ~%=Mg1U|r$D(D3I+JsnK0RX}JQfLT$ViMij#NXqt1Pg|Q9K@cu_OZy5@1E7i*
zblCF?TUNf(b_zfG2`JQ<-|MXrvrY%IXfxH)Tgs=IhR4Bf7Q3$bZhR?e6aeZH5<`5V
zOByXvdsDoODV^UHO9eACD+Zo<F@kz%UJ8l+(mQyMcZ`yH(<4?cFL_@XntJ66(R$E$
zFv<(}z~$x6cTDF`pf=Tacde~2$&h&dG)K;RavdUZ1>tj9hsvPIWbA8|i%4dS<A^N5
z=muB(invg7c)UZWuBdC+cADXkq**+N9tw-AeLuLHi^9~1-OG`3wO5G}Ew+4y1uw~>
z87b$J?!h(w7Fo%|o_LUVv5Qh`)XLhf!CPc!d*!Ta>0_5z2<hv$xuISO3NQIn^+bqp
zKfDy3QiqpN>v(m2I404WT3-BUDn0Nj#5@U|nA&_Z0_Pgno}-Yoi`T;`AGf#9M@!)p
zd2cx3<_9g0(G1uN@!XL&wchc>Lz*R&&l_(*u!_8%RK|hLm|d1_MxBwYTRwdU9&U*)
zcc|N@g7xO%pD+BzO+*4b2OkfwFghSM-XzgCtgt2e3G&U8uC14qK#sH3_DSV53(=&V
zKh0^MaE(GJVqZldS|=>7ah?;aTZ8rCs*w<u*gZ15QE`qzd|iF<s|5>l$Oo#>%EU<P
zTf+d;zh<enzkiSe0IR-Z-&*Xmw+|dc;46IaJFxNd__p1_(9eAntM2UY*Klwj9E>7}
zeY(Z&#q5@J=gxVz^=FD8<{GoVy)B@Il=g?;=gm*D(?~;GSV#pT<cG7jiZ$iy=pgMu
zAw0xTYz_i^&)em&_it!>gy~}2Uot5KF~AZWno@jw9FL(#jxxgPZHtRMg|y@NO){Ov
zTV_wDS2at;QMY0gqiA7UumLwg)th-f)$t`}dzU|>i>n@cjhMB#{BeF;5FhTg0)7MH
z8QJg}LGWH~J!>!VF@yZu#yRQi>};FE&rdT`;iLmt<J7A*dPP3_6y02ZuflNi@bg-(
zj>|@?5)QW`DL>hfc<8tsYC~Cg<ah|@%mhdu@5cHiiw2=i4`yW1_pV1j+DmZe4agOm
z=)SiH{b<sfeV;e!gTp<l+QyD^AKh&z=9b31zzA$XQMQNn*D{K}UxU2oz!5l`g~Q$V
zo!gW_aAT?9JUCECJdTai$AZV5m=2fk<v_~G-hafPq*uDPLvK_OpPkoy6ZoF)1BfNJ
zCt3k_JE(^GmGLz<GRkascD(I1wqrG{s%^h8-$yYxu09Ci2ZYG^{n(;c2p`hZ-O1m2
zT79RmsOLR30c^+g8v%OTYOLGid=|65Ia>Lo#fx*Pn97}YEiLvXcr|$oTPGpx7f661
z&mX=heccUJ7tChBI%Eh9dd+@T?0Na#PR^ZU&+s4T)J>(c?^DE%TnxHN9$uc8!o<Gp
z-<7%eu`xSKM-+awks}x4iN4f1`HJf`7t5VlJpA=)N+#Zzi^=RuW-TitTn-g_irX{l
zwp57e*iI4MvdyIwN{?$zaorZ0(DI49^lLZ+k<}DlQ$62@CKECa9q9>!@$7M*;X6mY
zag!rC$tri+6FlcIOUy#>Csu4D1~w1@ftjgkV?#q}8GATr+6tGDmY$oQE-Aa}t6~2E
z{hFPVQC>lVFE*aw5UJBsIq@PO_7R>7sM-pW@8p6yI3y?3+FMp5#!!TP54!sB^DGbS
zDX8~KFwMgmeCusIb6+SQ&>~llATJ08OrE8Xt>G4~PVQczwu*wjuhSGJ`u6I5RH~7{
zv|kuIj<uh$Tjr-phi%R6Ah8rBd2v!74q3>Q7rp2O8JuD~ypC4lwzW9+``eH<hhSV3
zb{NN5PTEs3wO<$5FC!+N=N>4Lsuk4CH9(*@NXkq0t_awDF9qKuLr0=LC0F6u2zz8r
zk=GxPxpH;D5%NM4(RG{ZTrdLERs*&1pm!SU*9M+mU;uxeRE6ZF!>q4G?)8fj!w*ro
z!xO{BksRHSMuM2&^sX$|8({V0dGT&~7SfmvhC7A8w;{`eA@paw&cu53c4Z&!6R`!<
zJNH;L#i^PzS4c1j_UYCU27}a?k#pgaf|HNc{3|jmt=gid4D@TUYvW<U!gO1oYRpex
zhzVu1{cMOHW~`6mJv=+%BcESDjV)zLdmbFPS>(EAC&=`5=N>BjCYSx$+u~$usyz+)
z2~?W}-HCeOK*+o;M6_45I&~js`oJYyks=Q&wbY$`8I<O~v8%tT|55{A=4W8N`I{V5
z!Z5ZPw3O1~Q>VKkP9-zcb?cS&q4JfFhY*++f#<Hb)Yp)H&a!<>*{#|g*x{Ean9npr
zXWd;D#Evb0V9NDE_n+OKJZJWr<pJ$G1LqzqwZ4LpOVtk!q4}&yZO5*aIpGij(_62b
z6ZRA<-9B_`-DnzWD290qJD3MTz2Ni?cjLjwDG-9co6(XK?>{ix4Yt?FZ`5S>C%J7r
znKW6iRkZH<RQ`~jQo^+8Qa=0SIeZ8HKM*GdykDn3r9PFng2dtO>ilC-b!wGM&igE`
z;{0@g)>d{_t|tU^95c^NOd<d+g{O5<<N5>egu(YEx7zn(yJh-6O`8D>V*W5AVnB51
zM9Jgm-J_fda@Z#Vdp7Xqca%J#4rjgO`Dr*kUTlvnybUiK<4&<tEZU*&s}R}u@X6u2
zqMOC~<*gEnF2N4w?sNKj2h%N*_YDcKMcxv#!21HX#^lo3g9~TLzHYG6cx0p^UtPhj
zCB$^dM%&LM5ns&5AGm_rsmh8dTAF-TeX*zJAtOl=z~i&pcTCjIhBRQbiy(M6o=~Co
zqGD{E8FpjC$#n?o9{AYzY<;h}9NgDCg(Ki06Fj5ngIcTzBQfow6X<%|<>8qF%bT5<
zSXmzIIQUYfpesZCCnWR=iZC;3EXsyJxhR)Dc;Pc1a+v}OCzH+G2LPn|?)zv5$2rcI
ztS^NH-G*2>I~mR|?Yx0|)Z2o@@s3m@#yonN<qhD(8HK^eEfmPT1576tvl~jmoI-#;
z{#b>#f}_DxR=<>wbBadG(&c3V;DEi&Od|y1Pl-`U0W}BHzVz}I?bTkzVb_IO<`1w=
z@m+w|<Mcn3kD)Nb-JYg%F>{XbpU*+)g_Zw#xcx<jYHIkTfBrl@4oCMdI8IDV93LM8
z`{U~#nHghhsJl*`o`4ILy5KRS-IKmKWS{iqcr)=+4Ui7145*><Pug(1giv=J-I~2~
zlUmv`1<I|K<fMs2@=W3L@F3p5=ukqfrj-9-Lv=QCL++~LUk6YmGW}KJjUo=F6^$|R
zm1gTG^_i?c%5vm=LQ+!B+qW$jq*Bz<f%|Qy5;GXxDXDk7q<;cndLXq{Qv<}PX6*KM
z9VsNh&D1Z$g}w`*@PJgSYrQU1v>)4)()smn6oQS#Ga|ut;8tZ3qsTz3`{ekwT$;ad
zpsCxoPtZc~1CC}?oBMYnO2W##`axl8OAj}On`ox{N0)Q`v4QHO)BAbv-Fxxk1u!eE
z2?zaD)z$ay?ChdnR}j~k$^1X?p=YOcla;?He*yj<i6jO1tbu9lYw)L+^hS>Yk2+~k
zUI+sF>wren1_1d3YQ!CC5!Bd8;KPXdtH0_eccT@G^08o^Y`zn#Bby}#!SZW=J|r-o
z1MV9@`2XQyvO+2ViP~CPxT554hqbn~EdhsLu;d2XHy9ZDEuFNqMu1KlyobZ{=duW8
z+@{)9D~3^@3l>^o41f}3VDMseULJ7*-E1<Kfhw!7uWwillu$wNKwyHnKHWl05v_LK
z1yCb<H(E{s5Y)<0W$|i!1NsBzg&o8DnOyvXS+kCpFCA*H8QtbQDd1n=G-l}RdN4u&
zs@+qZ{y$-_8UTj)(qsO=(46IV=qeVK6oAydr{PmbR06^(0(|@|y-Gpw#6;2ImR?^p
z7_}W8H{V^p1?2fdYcS0H>-6;W34kYI>;Z-yK<6iZ2?$v}CxkVb#G^~GlTGhaUcN~c
zvbFsdJ*c0X_aXBq{GmyF$!U(WmmLu96)xkJRu9+!oHs(a1ta{)c1l$ID|lDzuZ!`u
zU?NBmEk5z{0^@fj-Xf&a{>6A^ncv~go1NqZkQe|Br#U-tB>^g9p<~R+$(ht6bL^k<
z7vs4tN<_Q{|Biwx_eb+sB@o_|O6}7WkZfDo(*%-)!x#E%Aj%^+^$e1K<o08QC@-bJ
zJh<GC;^W<(%GLE-eZf|t02negD^*#(;*mK<F>ypz4j|8=Q%IhSbESvf$@NriwNnxV
zc*+<4&sM*SeN0i1IrmfG1Hro@XK86^ZH)m`u#8r+at~!Aa7W*q?-7he77Dnb#CZbV
zjI@8HZA19jyc<5!N&&HJ+(FF*4udKvRy%5V-~BIA_}MfW<uxlyfPI~6dwj=FS9<LS
z-MDfw5ZrXz2NKIhPQlf(bfQ3W%_U<j7>peQmqj^lE%rAwG+ev}e8kt2BlLdIFjwlK
zfO(!;eoy=7wwIgpk9!&3uxDzq|Mk2ahBBd#EG8;k?l=&<x|(5p0yXr(Y8ohzZ^2Bh
z0O<0<#U+56$FH&=b`jpGSF)RJbdrJtztC7#Rd)^k6+QWW;B@7i(eZ6n`40LocBC;t
z|Me88BQ8o*lt>qUTL$72$FZpgdEenWZ`9I3YxUcEXyAzRvKI<mKVp;}2hL|2g8}!M
zrnvMWgk}IB3-$!9Z%RscoEB9PFN}tDOG5#(-Fs2VWxOoK_Igu(_nRy*3zhwwX*ya~
zMuwgQ^qeOApqwU?*v_3jg@nYL-o8PSM%{6~=MuG}*#m&&J&&#!p|$f?exrP%R(n72
zCNIaORUE2puT+9vrXc{ma0Ka<N<*(jQrXHMudX&62E2RsSS96sq0ZN|`BgiVMT|Y+
zOZO)a9<+i!b7{#pf=6!G<<U_mLM8xK{l}ubLJ%|(d%t-T+<6_1y`LKF$X{$b{;<sR
z^GQ@h1VJ5<`6|S#=8kdmMG&<6pH43gnild98_;7x^u|-QvvD`U!N<}5mSUXsk=(ss
zYecLge{rv*s_HnuON8Ow@zFsjD^OzO<);trwoZ?ajp=CS`zG2Is?wAigUBNM0=`$P
zJ`FKjW<<mw@XZ19cY(!~Hs$T+YZ-%=iaNVSPG(-KpTME@LV)5QWIyTie{QO&32vrS
ziu(zR9rm~h@j}*lXa5sLrY5Z}*o6;+tTOZRLQY)0GWN(%MGl;(!w|-1r?RIFM68vm
z?Rr3B5P2`!A-R{6KEWcaSZ&--<p`o07ahI3zw|uzv1lz0Kz{_mX?$F;F($m;35wd1
znjb5W8sV~JTTm<|GBgsuyKC~!F;~V;ej7c0e*SB2@~*z*NUYa8)oGkffVtlsJz4b;
z1w__kx&tkl2(#_Y&1v}F{0|Wf=Sg098+%_c-W}f8XkBk>47YH17vsC^s{Ph|0&@W7
z0(n~9gMA-=a`v;D4CUpj0=K%kca=iYGBe*ix}a8cL%X>Ubap!knzdTn<dxs{*|>%W
z6_+L;u%BSENjT}x$qmn}soDj)3Eln~);b?y0~HqSBgo8)Jr*>=Jwub%KOYDwAK|P+
zZCflnW!|`6mR<D-&jB<N0W*sLa6NO&e<+zQJl_2*fbqct&jo<Y^em-Q{~ylYGOP-%
zYa3NTKxsj`K@p@I77a=_k_t#8-3<n%(k<N~-5{|*x;sU>YtbENE_FY9@9%lv^PTJb
z+m~C{nrqH6=eWlh_qYcU(TDebLuiKGnKGmQsmdZDVH_VHbNqM^E({&EN}0{oI2UU#
zJb5yXW_|N2+3m~}f7a>*Dwu!eY>9(EBj|XwHF}#FeOs`ldDPNuVc*|mle2<mkXB!>
z$hO{+u~6IdvM)&2-{LeeFQhpvphTNq1paspzLI2=(+me(Z1@|>k1^y6+Q*T!Y0O^}
zMRGEtVPHJ=>;`?%!nf4>7Ma&*<ekGJ<Al5gCpqa{R(gPXp;|OfH^O<LMIwt!`*{vI
z!>_=Xv@Eo{h3G@)-*xJa(u$G>`e?2;^~!s-SHV9ozSnv~#2Sv5tZ%Nb)=>r`XcbC7
z^hx>w1aSJenHz5(pn_d!eFCU)3(>f^xrHw*F+RD&4T+LI8q>Be;WcvhG<S33n--gY
z45m)MCIm&_4^6l@W#E>w)Tz>6310J{!)xA%h_aX{1d1BfcXhz~n3QmNyR&Hc#gMWh
z4LiG8v!6xjtFerbWujjH>+1#!ner(xCEXo7mw=Z_oOS@ou)mhMvJ7{bOT@OEJ*xQR
z_ZgP7!&SZ7sBWQhW0}39cNA5JDnP!z4AGs8c4Gn<{k*0PWJVHbb$uR-qdki;zd4KW
zTy%zGquR<IqJl9D+Oc;5h1&FT@V&QvCF#(m3^aZ0?AZ9_M1|2cQGvOycWNC&c7I$0
zL&umz0n;>o)Vp)%CKKU&Ly6JkVYQl*keQH7hm?6CA9qJ9=ZylvWK}H@l~e{HMHJHb
zTSNhBrdMBkejh|)rJi|$fjUUU*rI3^;Q1wfkL5hwyR{SOKtYd&a<^I3`v(|X@a_V`
zJ)KpX`0CZWY2SHTtbH|8(<f+Q5f#Vw0R=G`pyW;P<yCEXdYULY((u|~|M=LLx~d1@
zdUc*BmX~iCb#syPS3mUIcst$MBm3!deClWt21amjNQk6@F2*G8eO{@E`;_sT9}Ql!
zU@xGxEIWf>#2i{ZWNFjCX;O5Yyb9ZcJR^3teDdw3ij7p^{>I7loRi=i0-puWTm^XD
zOgW@%85IRP<_U|*Ev?=-g_ovk-E0dR(b9OJ&>`^>|G^W?>mEAb>$Q#`s*%Evc5ihN
zLJaCbk93)yMX)AZ-1Qk*t{Ilvw;nuKLBFWCbBt4V`{HES*z?=la%A~JH<t(7KdPDs
zXxzjH_=f_WXFaQ`s|k|EJLN(5D&<@N2=(CnM@#T_ksm^K?gt?_E_Gnr#lJQ4OSzy#
zcMcs)mjJu+9gVJ<b@0}UrD*BO4=E*z#2&AtED?}6o>ftD<2zjjFQ-_u6jYD8+~7)_
zk=8dz=O$z+qd#Tr^<xBPhsPv~WQ*XHowf2=F5ZqOE=vqKN_pLRrAOzlq>wbA4sXjg
zOZaMk2(&E9-gej+aNP?%!efJHW^E60-7g$Bmj-inRLp0mNiZlhNtB}s8Z)xFw7xop
zGBc^__1zM*O{Tl5Z$8)Tc+8gv=)1XwaG%`bNLfOuI=OAR2RT%i$jQjc^5Y+=<lydN
z<G{c`&o>x@4UC0f?&_hZdIQy@oZNT*dM=Nx%$!M}26$y6Vj6#K&bP!d1ti;t(XAyD
zO|{+^jrYV`F;+Q#mB;*S>;PN?#lTc<qH3KvL8DZzeMjGCYZLX_0Z_5mn^pTMfEu7D
zqdbJVIreB5&={1SEIl_6SR>|A(3pt*{+;oSqphgur+hje#bFwTJ{E8c-$lf|klj`g
zbcm*0hEy+FqjU`1Prh)5Hn?vrr)O}_aIaiHC4W_Id%TgT?abE+arZx742XYP%gmAV
z75d6PA_Fq>Bkb}KZ$ZMwT`{3^akM(iI;kO$^cuF_(HkNNe+)YgY9N>GttsD+e=&|+
z<Q6MW$_9Cu$Ji_e+?USOj6SyydZc|pTGQ^Cf{kd$o)hWxFo<^#Uwy5T{B~`+eN?6{
z^?q1BZgt!o8hdR-ksX{cp+i~Ec!7;)C%Q9Tw+KCdUA7v*DdCsa9YnjCN|@;R?a)P7
zUT6W7U(tPI3KOkPGq1ULvGz+UId(o?1xTsD_SO!~o0&Np9+-0OuMbR3G?!er+()-{
zlzD^-_nn2P_r8f;ew2hCC1FfKiy*XlTmi6|T;KZfYA*c*Y|UMfUA)?cQmq0nAW!$2
zzfL(94IRB6sEqSEiqa7jizTf{y)edKVgnxsX!#QN0oa_vs1G}W%?i=c(E&2o==q$Q
z;Y!<=Qth0q62*AUsNt`(??PQmSFpOr`YD|9XXk5)$6z#%>J?vJCO%i`E8z;&%_QQo
zUx_Fw8KwweJ11?T@;TGYCN95V@;04>h3;IPmG37tyC1n*vCN_hSJkaj*IVf{V34|}
zEY_3e3R7~b8mkq<?ttCd9sQuGM<a8ByfiL9+Kk5~_oEqe!o%pq_nX-0RxEbFg@Ln;
z5f1`my^P~SS(C-OA5t@4DhuA^?QB3d-Ga8|;@VeU|0G<a_aal%)6B|b8A|Bi(|5Zo
z@CEmrDJZu`=>8{8a#jz%6Bh?6<Y#~w`!R}E?QKAwHofMb_j(J9_}XdpjMRL^MC4Mk
zZ<-Z6IuyN1l4A^qeILK-rQ(X;DJ8+X{f7r{<Apcmw1)mHRnf=Kui<b=W{czrA4+6U
zF(?ioeQ0WL_x}a0n_-GI2AfmCfXIXNp|joOL895CT4D|4#~OpD5I0ZY^AciWNJ{r)
zl)x(RN=_5M7Vr(7ksqEHySqEzj&JTud|kT?D#gw&-r;i4tU!k&vt$w^tI)!Rao9y;
z3fTbDPx4SqNdkR2hzM)rlh~cEB+k1r9c`c>&i@{l23@|zqS2TIe`^g>Le6a=<Y8JP
zFJ-|tZykODmfZ}<67I#E{}|yAu?=4Z84(`F6dw&zm6Ahb$xxh3kSgXy<cSzlcmp}w
zS8@w?%nO-a9K1(wLfbA9=fRhhUYBY);5GGN|MkzBv!?)&hH^UNo?{P7=bV1PmfBs{
zxK1i})A#*Y{Vg*bkR=AN$9-|Juu_KCdF)T;CskM8K$~?-@-h>48|9Lc4h{|`5pLTy
zli%vUeTS-e4C&7KZu|NGMI7F9%G6(t92tNYl(8+0$X!O3{JdVX)`<~P8M)|8amW@p
z3~aRkjEnS0c3mO-vTY}^GH}{O;jpksotEDG^r^SJq@)u46~*tC0b*>{&e>UfX<y$U
zDcQjAZuhO(o9)JTo{I~{u}6@wu++}Z&pa}wsk3G#pc<K?-8Y5R7mCHF=!>r2Bmefj
zm*DzY*;6JFx;}np)J9W$0<=4VX%Te?Z(EQ-4NOX4j!t;mW@AaAwUq6?Y@UY3G%4EM
z_V0~e>tB5M6NbMhY150ZVT<6>*zwFwt=;~fW{m3u%@CcSPO#yNLn<Gidm*=8X5Kwv
z9NNG6Mw8U+>m&+~F*2g&X~25b9YNB*6GZtt?Jy0VlJDX#e<}kmq{Ldlrs$oDru(QL
zny|>oFot7ZLWH{Apa_`5mnQn=4rq}tp5St{g!Clr-R{|7!T1g0-rZ6m?bQiidcSO4
z{y2xXRS+5xuN$g+>brCK%=%eT(}zo)&lihx?+~;YjsBCf(~}6nAU_7AP08<Z??Gn`
z4$%oZ-uwLg{OJZyNKH*6CX$||-n!J}Pbp;w+cr+~u>^isI}men9NNiLbZRN`?rwf>
z@VxNw^wj()HeNx4zI%A+6O<vE-KOs6TX-t{h>;QU$f$O3g2RA{!9rHPPb<~}P<0~X
zwSn%W9*eI(^O$MZ$+o*mS7b@4F>7R4n1oDbHuK5JiSKUD2&^Nnt+USvev!)A({>?n
z++4On^b}X}Bt9q}Vd_0PNwX#r@d%np3H>hWScuqd>|5AjldKGim>hoO#gkjA3^`we
z_FkqKmo0{ziX}|oJnKh5^VxfaB^=+xYfT{zI1Wu4@i9da!!DpUm$1v7K_-u|mMTIm
zg1N3nPOI&e3q@c3RJUbgEpao7*(^+deLK8zFz0z>^jK8YF7`0R7)kvz;{G!Sh6If!
zs((0WrP|xnx`2(Y<9YOAUsbv?R6;hUeRkf!$t!xR&^0x$E<$1>h?x6(3YprpqNE?W
z_oXdp1WgI$!Oof|keWO^bWr5iDsTdV+`tcSv27Rw#}C@CP!Labzi-_z4wNlhl+)#u
zk$SCWMtOI3sXgUirvBhh9Ox(!5)*bP<b30KTC5h`1}GJv%cV!NT%6am@iJC(l|IkO
zTT7UWjPA6<Eyl<B#_7YQKWOvyI8FR8jnd(L?SSpTSBPVU!%~88C0l%1lh0d9TvIdP
z{%I0Gd)-qoO3QY~$VMf4Z%3Nt^_O9JlcEqS!VgV|@RTQb`m#+Fy6+a`654WsbU2=9
znj-LLj}j)BF4wsn3_l%-yr7T19ss$00-TM<l3L*HyY%JoW<hy^)S$vbEYkLvAyDXj
z)#(9r;#qCuZCeH;+>B!%>4@{8Dcucg=i~LEOkM8SOd}&|GsO+vqAO)}<!#cbkUE;=
zks@_(N_YRZ18SJYgp!O{@+mnhdi3<hiI2f&3(|>4h%KXxQE-eMyv9TmK-X+FSz(UP
zK-lLs$<ULN781*J2*HYaJIlMoPj%3a7?!;aGJLzpD6y~1J{wv&>hb?_uKt)cZ$RwF
z&(FW69_OG5`U`R~9}#xc4#Yn)>v+B&3cXZ9xST@Eh%sghTiC^;6IHLKzL6>BJ+o1Z
zs@x9%2@Zd0=*?9iZ|b60X;ns(BGE(|3CCP}xEd1X)Dk>1(`Y*nP7(HFPtbz*ce9JF
zefrF+6KDaR1ay&pZG48!!3(d(V-7MK4bEdc_(<2?+{SNKWSSUonbsoW872KYRT-}4
z!Nbob%hrC!rW_T$#oW`zDD<wLtX?Q;)Ex^C!f@kF4u~z<*HTC|+bThCun-$W(HCdX
zkKI|y$$4iKkcHOY;=9eH<R4{!E>jum_|`lsKo;ptI`Dx6;ySqt<0Fdb*itQxzV)n?
z{Cs3cqUYnImb0YB0m{A{a#UKi;{OCpAXr`EwF7yrIW|fEje@F~v=VFaC6F~E{SIvZ
z>*$;G!5qaWX=!O7P052fJOX<XC2dy>6!w8ko9Q}hT-neVP_x%JFwpi~dH&1(i$n7S
zA3yeciORBYD>{MtN?9#86!lYGH-^I7QExq;JDpO6BGn{4W6vv_5%KIEefMVmLkJwr
zvT)H7oJ*XHg5@`<L2)8>r*^!zggkQsNW~GT-GcEv^<7ay<h)F7Vvs=#&TG(7M62NO
z-YNmKuB?|V3dMMizp(`!RsK+~=z(9Xw#F-Y8QTC&PXi~C`|WpxSINKaIU?toLmTeo
zF=#StHu?!ClPBCQ2+A@FoPBpxw;ryL&oJkLw_m!a>PhQrqAl?`Hi~D<mD+KJ+u&_G
z3^#)nlz>NKh<2vGiD`@C_rJdy8vfq5q3`(6`#}O{V^g@o>e%uoi_XUpDSFD<EONqf
zkS`JFEgE%cb4b9DipNl&t+7R(njMmm?GKrR-4i;jSjKs(AwJF+1zINm(eKy%eF`V`
z<o00J#IsF``ws)(rnBP#R}IX$2}MO@SlKc~MXC}E@xq~%Z)t5US#+LwCrYw4)&vR*
z7i3;Nc`>vmIT*4T(z-XR_^UmoierzV2jir*IBJ(?n*xxwV2{XOy9OG5#zsIt+FU>f
ztiz<h&QTl7RuSsXNaM5aOu@!V?24@CK4)w)Y$Hl^Kwak|x6v#w{^p?#7Rwd#F^2N3
zH(3Ze6f9VbcMk$3MaC$K^LUiDitk-6_vhB8Qrq%8JGJ%YCFBtao@~uF>}4MW)W@Yj
zrxG-QgK}I%gB=%ZiSv?h9Oa{H!s9wy41eEY3%8k5a$Astp`j&hHtIde7zLa2--Mq#
zrG8U{YjH!S3n`AEO3<krhy1Dd`(6Fz(2FqNsYV!}@)nb|gVN>OzMfK&wJrv*D}4y?
zv}LM&)~(pI3c>Y?M|&mF-C9uQ*r3+2d+sHrNWgB8%d-lad#6B98c={T2`k4Hf9~Y$
zaX$x{h1mj=WJSR?4ONUw)4mEi9=mch_~n*BfgMx9<+z`&9^!qe*}{f<%5TvN(a`gr
z<|26yI@<ILvLjzd+V7lC<|RB4Qg_mQJIFDR1h@^VvTFwI%F{7I{O*7}=0d`!g*@|3
zZBXy@GMm_>D7;?Spe}i%J6QSMTo=vl-nFgaOh8B*^W3Gic^5!=&uhW}mC?ty&9s;j
zizDF{AG#1h(CnqGJT913^&-_IcElC1GPXnpLt;G+_px=3B+MsSU#fX2XbF^jNBpmv
z;S&c3*}wDYQPI)gYyYQ=dV<;&#;f;J!KqkIr-|6j<kwcSR!-DCnRfZ`#-?M=zZ;uS
zHY_+s;Wdcc3CX?_pBh(9DCN^KU*t^tfdH&OJL<D}&_U&YlZ<g$*g`Q-QGqxmS>W{T
zwS55#AJT@|cf6V-u-JlL3~2i3n-3p8^de1^B7x*!Pw4sjY1J0ra^CO^7*r%^!t{_F
zGA7aA%iJ+jA4B8TDe&N$;9!d!C&^9ULjF9y08bnw`ojSA^6p*#t}cgauNrq?_HnU6
zPZ{1Y)p>r6_r~mJtp|Kpc=fusL1w&P$I;+j>+S#EFQBLhh9hwj&k!uhzj;G1B__P=
zxEc4&mjsHC8uL0K*mr7c7s3MR+2MH%%r1!VhZ+sE6Jzw9Z=qz`XmNYxIjJ(cc^Cy%
zfA9&{b3yjZ(b8X4$-LQnn7Tc)v9Sc4MqL`+nV&vk+#JV0UqL*TBVwhp`S}geWD7Mc
zbvBINc=7&c{BO8?sKhcI+fDx`KZX?V|JFIAKF06+sbN&v(l};5;wL8$AQ*-SUGqpR
zyyc>rz;;CjaD*VxQDv2Z$E}V5RGY568VtL~5`KO=z%LkN`u&?4_#6z~P9waoE_N;!
zHuEhwg$_fOe-|$=KyRMjkZD(xH1zG;th*~Kz3zPU^drA0h!~`*35)>gzi})*P)HI7
za7hkO1icENy&u6jY+$H~HIi!Dw<HvPaMLIwiuv*1W&*hAuLskfQJ5P#$I!1LYCU$<
zdv`E3rL_wwzT}-uQ+xsj>d3FY`BEU~q`qvkqY2;{sD3<855vGn6AsmaME9mmYyju>
zX*DP)zN~a~R~cX~ZEwxaLj9Qs;I}O<XM^+7yp)p~zIPbe-W%iw+K9ITlGOgx$>ZTe
zVl1Ys&vJ7=Uec*+ZLj<JnbTlDcp%GdcNwgsJ325xfXfkSOrg#6>g!4*)e8}_Qpi3x
zi%oG=6)lvNlrQJ|_cmNSE8C-*$ors*(}MnA3@6eu(+VwfwaeG5CYn$5jVw!zAB_wx
ztXVc;K*a#y=&I5sf9m_Wr|=ZwAn_|(4(eBu#I`A5M2##fYaTG1W|_<(*kZ`A@PN&w
zyl~^;+qbtzI`aQ-soI3Lx3f}muRQjbc=zo_a{NzSv5-N_m&o+3|Amm2!RXbl23nDw
zV5qR`fS)`Au(x586;FA=Y3s2BkzILMT6#WfrJb2c$_KK--8z*P2TdQeU*}+SB=+=V
zR96#%-J6?RJ3S3JJX{WyOAY<2MW`h?`L3nSQLdx*|I(50CubNC0LU*O*B{wsMn76S
zG-NRm9#vs7|J@J^Oka8PreUI-&OhCZ&q0w6G{pC9fve`_A4i1B{#|mp-pw^%gO&M-
zX^xt^+x2Aykz9@Z30?4l?SudMP!cO|bw~frxdPyV!uS`{N=k}(z&CFK$O+)yuNl<9
zJooEG7_=YRSn%<|EyaHP+Scz3@YYH0-^Y+kJZE!xq@Mih1kU99AKI<{BEM`ppYHe_
zLBJ>D;(KQgM%w42FRADH^(LGdT5NHkq*+wWmI%`7lmFLB=dhz?-4~A5fp7)h(hZ=^
z-32p6p2CZUaM1ugma|jYUoAoin~jK1iM#=KP&qg0U_V^!w`PDuo`5Gfe6A^ZQBmpV
z&u2;uAd^Qm|7`?x_KRcN?`-Kp8$v7UHtQ2#{iIqF0zKq+SXo(t0J?+Y(*#im2M2li
z;5_Yj0S}Se)R9tg$jPxuNp&>O>_Ng&c^!G)<#aDfYBC@|3KR(x=|R+~uOInx#^C1K
z8X04Y7@xa2fA(V_nw-2&3({Dst+#QXM|j=k^#!@T{b$=62e_cYzYLR3#d*7J;2AN6
z=Xrf*{WoVv$IqcRpjaq^)d>h6;PE|8Hyau%RZ*@a4mzSC#p8NRYr-g7`RS8%S1%(H
zyuS9m8Q+{^`Qo^q9lXDqWdvOv9qE}v!!Z{Z7oxw~i>M{wK`uDR`n~6PuBbX~?GpK#
zIG7jD5&&8FU+rWWJqnKK&AOh%L+!I}nbyph>QyLbnBce&!x%u*NH+mmmfF#Mp%5M2
z<@L#7JCSZq^NMU+c{K=p!T-E&boMK>&5KWmD6h89ecm?A6k1{LlMleE1>*NHukBVb
zr1aLFmb@KY`+@#Hwhl$|+K7^u><O{53F`2fOcyu(=iejX<wIRB@Cp4I9^B*5(v0a#
zWNLtZAMLfZ2R9Dhxc={Dq%#dZw(9mjT$wOQ6Mg^k_j%j8`!$rO7VaTQC3rcT6_F8~
zORo)>8Z9+bk$=gHUtCdyQYTk*jarJdh+OSdgu<`P0%-sEOD3c%!)yO2{tTJ&YKna^
zTHwO{&n8VLN$9w4ep;uOx;*G-#vmUVAMF*<nm;}3R=9S3Ds{NGbq1@_b#%0!2ATW+
zgP{>N;_GlVwr$0O`F?(KWJ^pa$^UQy8QHVc!s^F{{nM8hpdW1UnEz}QN+xuJBZXV&
zHLqlBWF#KzQB-ta_u#e^8u`dcvNvcQ13Li{jrvb{giyg&NIA;1{z?TWqS8SRm%0*t
z)i(;ieWMKEAsEpPtK#sxbnKiN1n1IkJyU<<uR^oelooKKbQj2!NIvz*;ywHB?n=XY
z=8SnMU>gnJoAMCQro*GEmbJ)`6&!#IT2&X2B(t^Y-@sF%>n|n4tamQI%C2sxQRq`$
zI*XOzP`@fQKZIEIZ@g;NY?YAIe*yF+a}9t%O~ymzV~_fEa=9{1e16V1mJYG-&j&sZ
z*z*DR<Hyf|V&<4#ZdzJ1^eW4CND0`U<qtui|E8=V>^W1H4GK_+?h7=4qzz0sW0>(F
zCp-WbXUd2>Z+p#bZRhyZM<!WEz}7HAvupM&BBV~@0)~GT{NrF5>Zla&^zGY2c}fnK
zCuiCoTjhDg3%J+ELp{0FWZ)5)Cixc>xV>bQC)k?n<<H?aqA-=NjDB!@>{Ge@<0RN<
z+2<Gi!xn*Ni18uh&-Z(q%C~W07PLQMCuyWk=+_Dk`>0`Rm}dEzD#@vJnh<OTW)1HQ
zl;!BW;76eqiatW(&id)uO`9MG*5YdjK4A5LIrLzVfY+NhRNh@tYVO}s=ZU0zd=Q6&
zNTBKC>YN-LNO#dQLEElO%vat_$rqviTfvv;Xe3*p24^k%`w1+3)j7gC_&$IgZoO7-
z#lL0h!|5Sm9lqsUu<qUqj82=VCuDQj1Ee-7Pd`7BC#|K|*T3KfMRDBIL*3<W!D(Ih
zqqscc_Zq(H{X4n+a!FioV!<|D17`_NA~yELvuFPEnab-Yd+la(nmk^#6nbvLjc-A1
zX4h`H>u_laQ_?H3Fg*G_)t4_)#9W3E+cfc_l;fs?hc_^+b&$+a=4E7K<KT5w(wV_G
z{G4sl51v$GIjjY&U#^oP%qn*RUXG%%qkL2c<%aD-rS$E$*p(<n3ax_Q0%;;V$B$gL
z+CaoO(3Qrs>G0SZWP8N}r_Yzwq4*J$KP&9BQDH?kj-RoVen|--wGD!G52hy<6%-WZ
zoVNV<5pe{ji%oV%TMD|^JVhJ5T@&%;3obslSe^hW2?;4Vd00{dNhuw!-7h_%>j27%
zZ!X7M8O6n{8D6o;&mKr;&W2HYOSCT1{MUlz6gsbuRfiqo>mIQx@Gl9rk2(#&Vwd&K
zCWR~}L>`0)v6#Z_@MYKFT~(3o(a>zDvdn{P&$<Q~&83YMinJCbKo%LyQEir%DSIuW
zHkRaz8X+?Mb~ZQnOn*3wBsgn*KqOQ3Jdw97My>mtlyCEECgE#UgymK1JFi=$ss8rV
z57whUk;1E@RQN+VMzExdNy!JZg$gKZo7)>JC@6;zW8Q;^`MrnhG`2UMpS&|0My}C;
z!30ERU(c#-*L2fhJ9tC;XJcW1t*R1Ka*C~9-4VhBBeX@WIL69W?(X~e2GKE0r>Wu7
z(qx1NvE2L2-qT(>J1?%{bZqwu$N<4H!#`KDt3G?3TlkJ97<wc<t3&qJdG1?M&Ju=G
z%;N1Qn%y9QN|Sh3b5Y$<S*{stgJ8Ye^+ka?cs9f%dB$M7KhZax!^3r+FkQ>1)4|n=
zij>jgc@H`#Cd^ohx&UiB-N%#*7_-W+rqq9$KgVGvx?>Qx<ier5jJqRj<Ig_i);bxi
zrJ>Ud=1GhX!<a1pETB9RN-pRqvhuAaD1dnG3EbU!csw5bqE=e_&Qb6E!-q`&s>tLY
zLRgpshr=nwlF-pTWeen;jwT%y{>whl17&Ik>oE3w^B=x~ZG$8Kbrh8FUrzavDG~C@
z;aQ*t@DY-(Tu05o7eoL$(bU`D{mmgE0l^qW`Io$ul+SVxKiKet4vP#6@fI3yZ`~&k
z0X$%3m1csKc;8eZARh;Pj(<G{Yvj{RcvL=dH8#%O01yB~9K{y2Bme{-Dp{qv1s9@v
zP!nouO$g~E0hw^|sTu2zS$R^(_4*gvZKyMR9cRnqkoarcP#T?N!5O$?{N(9{tWOXh
z<QEZM`qMO@b{$*`US9Lpr7u)fK@TnKDV%^o)}WD2mn~wiRw&#5HzwJq)d500F^r=_
zULexyjgQ$spa{P6FC+O|)kFkKymVsKnE~v()~oo0WPtqk54X%}QkLTia&Mb~C-;90
zL=Z)2KC@q9EX@2bG)OG`QYM&8h}EkH*?eA1ovm@-s2}1}xAE&A|9l9!e?iBeP@*gT
z68vM(v->|H-Vf!8{S^<bSIFnAb@53BMc2(!DbCIXHfEtVj`_{;<@yrymCVvA$?=!%
zAHN<?GgC3;07e*~p8gw3{81bWtrWZJGcrIp<IYG9k*IR->>f{YUC2$L#JkVeUv{}I
zBez3oDf9jw5Ksea0z|stGBCMovc^6ej#EQGHVu?=yIudvH+_65>B9Zy<IKUe=(U>l
zjjlSkcz}leQ#*KsfP5$2cnDv1RUxnoa%sIkUgoYoc51`ly*hQjDCwsrwOD0r;ZHr8
zSNXXYlOvM*jc5#|ty?q<Wd~&`l}#tEayl2ce=?h7OE|u$P%qLPQx5n6pkE0w&_c)p
zV@e9G?og>|jo$gkZGSKmq_vQgRNpbHTQpYljq*<Gi<=L?o=E~Up$9dU`kk{Uuei7x
zyCtqJZaA+o5j^6q#q_TFlwOT4RgkyH6V-UhtI$M#=5%B>_PDISRu=L_rbEr_INr-E
z>(C7k+FIKP(xwr}NbXNva_MGFtr$defEq-w(KC%N)jhZ?PrIgI&HWOZ+pn>*VgE5!
zq(JTB?9A84XLok!O~vMT(fBW#Hb_PW1_eF(sP+_OTi`C;1VLd04m%vhuviCE#fCF1
z0MP@kRYx6$dJpzIpg_a*&wheR1itx3V}~JQ!k+3tD6k4v>%3g;*H`kVahBQ4qaJ0C
zF-9=+UX3k#p7k||<xxF;EKe#^>RANk0E^C3@DC(oyOodb-GY_<NO5C1*FgQb>{9t{
zonsXjk7y3&i%nQ(FjYH}<(Hely#7bufl!no7~ug@A4SmXZ<AG^Nf2;u(U3koPyZMa
z0`#}6E95tv<@>j`>!Qi&pA1L6==BunaiqnxB@h$(>Ib^zO@;*(MnC|YmIkAudVF?q
zQCZK`vYY}2p}`>>u$^stZu(yk&;9DCrcm%@%WYtE(?|`olXb9xsXAs&gMEK=2u5@=
z===c_4;)Sbs?5*ogj~XWPEO<r_vVgcjECN6Ld(2IQCZUAu4&5&AdkHj-*-Y(+VZ+z
z*Rw<OZa9cZaE2ZreVAYBjG)0LAV6t0JaqFE6BAQZ#EQR-G!<IjX}EuKLhH)(>e~p^
z+dI7SWqiIXKSnP4%zLWe9r1x7AtAxR#`ax17&ET0`V&3s8f<@G--md3m2MOuCDJ+{
ze@2}CfHpXBzj#?0=#Q9sb=wWKD`gg)IcRrwrsK>0>iA7g;(pBWCL`k57i<D<NZq;G
z$V5sMHjZ|e-X_jbSJb*hNYau;0*`IRa`y6n7*3otG!umWQr9=#-4_cH3Jp~EtgRJ>
zl7IuEnGf`&WCQkc_qV-tj@2r;oa$e<wY22WskCV^y-;<SaK)1<Dw?X?R<VossU7@7
zA3bT2A^^|rkH#v-OUL{P{-w?Fvdwyy-us8>@w84u9X`A5L#R+{GroJMml#*kT8qx6
zmMjM{#!L-t8nkHNEYN|>7A57o23ZA#5TJPWTORC}2norgBpaZXkpLNo{(ftqx22<_
z1L)E;2)OW46`g)+<!_~j*7o)Y=b-25%DD_c4c^;~D^b2ZOdPjBjhKXFD`O7=7n{pl
zjynK-jR<#ksF}5o=;V(e*%eW!v~`-+bzHKX9mBP&AH5SFt^e*FO)P<c{-5-<gYR1V
zON*2B-F5fq%c9$@pkhg(5AUDlC}D;j24KGfG#`y8%c|f{(>if#pQ{wAp5H}8<FQ{!
zlUL6LtSE-*_n+FO{$Zpb8nr`6xw(jlDLBW?J|`_WKJ>YV8r0@yhfGzK?0vO~DAnFB
zk&(@8y&6G>4Lq=wn9T2PD;!>~CnurNb<b!`M94aBa=dToLIfp0Lz-b8g#`iK)AmPg
zj6(#m5-S@xbT5IhQkHxo5g}nNpeX-RrRr>BH|Z1B${q$ayg-Y47#M1~s+pN`<obZ4
z2nI8Fcw7Q@xqK4;Sr2<ZnB$Da!8rPSReO7)B)y?QkcEZi$&-NK*^i$;rz7}klxuD8
z{Hz6n4Uy7DuV1Ii?KxY-B|GLX^<3H#KTt8*&3*~gGj+_X2OW?^|JAs=B*7laI2yCT
z^B4LjC7I|4!kAE2#WTnSUNZqzL@KM}VBf+^InZ&*^Qj$(HQM6NIa64U?N10$CGo2P
zE(5@9-zN*l)xrHb+`Ef}0Z96b&K(G6oe?IQ;|SV3nH}kNiUW$092_q<XQ5S->#r1D
z0V8XKwF%E7$BAssc^b&tz@cThr*nFExWQsuuY#Q15+;=|1X-$>n>cuu(mCJH?|Jp`
z0HY12fy`hn_GID~)LNB6@vOlU)<{5QXP{mFgp<3<WxUXkq;(MueoP3nmB8WkmSUa^
z{|99qF(kkn!XTFzJ=g~3K!T<tG4;xGMTnPv{;J+Mos$|jlvB?H4GAHzm)AZ^O1QC+
z5OjhlK9TeTT;KZ)Xt=32t?#nlvG6gDB6-o$(V6}HcJtZWg7O+en_VhL^(%ZnsiQE-
z|CiQXS>2kG4@*}x-kF7QAhrR`s*}W@I-SDh3fHbvjWa=4(U0)qsjjs-&V)g}-%3BJ
zVP6f8K5t)Lh2C0oCb_%md@Fd_HXrEiu?g^=AxJ*O-3M93UD@tE$h~`XfY=5ykGnpy
zLx!jYh|)^t_J?ayzztguwj1Ly7k$*x*=IL7E+s~|lLaV-<6~ay8w!mAAUPoa2giXQ
zfrg|a+e7cRs`xkaHB|eVaN{jE+7Ipo|3!KAa33}mT)Lb~`9;)Rxhzf@SEq{_rlJ-b
z-7r*7*5d|QzXh(?3PuVCog}{F+p{yk`_mwxw>N(*f_vl*`-o%`GM2_{{Kz=Ag{B+A
zJ-z0m_}R$hNB5KqG@%IWH4`bkTTYq>U$xb(r}cjfD9o3*t<We_P7WFwTuI2R1S+13
z(ghY-ysJH_p8dmW2zp-jyAh?aR!;-Sn>>4VmsAiCrSh|Yv`HC#_6sNi(1xhnYa<2?
z6eD9pBut;+%v3@GPM}#UDCFkpX}L)I&|IM2z1WKuQeb*eDxi(Y<eSPz;T+CpVB8|m
z)J4<*bZTkMlqn4tLkOlK94TC1V|eHE9lm{G2-6*F812)=q<ibtt^~SCg&9M@r<;t(
z{zkVvq+>9fq&C#wsZsu`aFcxySjKRovrMB-O2#b(1|>QzNfLIpaZ4K~=NR?gr=zBZ
zO_U^V!Ab+th5vO{jUA0Sgw;hOV`I$<_W@h(a{rbcEjv3MlNFM?`8$h4TGWm<a@NS-
z+2LzN(#wh_*I(`!MYU%Sjy%-zgqV5Gsp|~4KYDv?3|tFt@eqaI8WokU->>QiPTDuY
z;Se3MD|L1#{oB_s#LffC4%x+tNHWIbrryA>>iQ;>`<EMDihY<A?kla0`^Xd%Z1dJV
zA%C3~*~rv%E&P#8ju@EI1qO`~$qfg6{KZydq5XUWxty9>a5j{yYR%de0D6=hwc&i9
zfle*j7s4!}$X9WQ+KqMpFVa@23t0ddkZ6nBI*FUmu^*;yWXr@g^pLtA*}fq5K<4Bp
zvRfb5h2Z8CJV4|`4-(?MfzeX&*E*lAcL)`cT}@MZ`SZ7II7D%B>Ne((xbP#{iY2Eh
zBn%X3?Ck)jGWdSCxM0|G0|t{xYW7ES#Y0+b6oWjq0!`#^%q$wpsQF|-5^e{;AvmJz
z1o5_C&!2c3H5MO>+oqe6`|B^_t=}ayOZB`-L-7~`u?mOY%A$Ug2a;A6381{B4}hi7
z<C$#n^Y)g$Mxy{xMuJVg;CB}`-jtza08DwH)p-dQ?b90xlpa0wCi-Q=SZ;)E>*cC|
zfJO`Evr$pK4<>l8*T22)tk(?`cX7t#R#$KTpi*~O6&Lj5H~3CAr5vk~&m<=`A;tBb
zh9S7RWXsufm$Yj-tP^-*APUwr5b`zZuJg+uey^LozHk-n9a~Pe1z8cNkAntdi`U6H
zll+IP3IRYPSg_&3C5y7Zr>NnZR63o%l*XlmDXN!0dILfub3XVaupiEMey53+o<N(S
zPm5g}D;s|XT;0CC-zebMN9ZS|L1GI;hIdV{LqHHWJuv{|0s1m1?*r}<c&4k;j~Bk$
zTwQfuXt`IQ7N3-aA?=&^s`X<Lnv8kw=*DE3$uJNI>CNv--Se!pp6$~#6aa8#goK<N
zw!Ur#)SqP5m>3o{pv}(Q`LnK+X8X>q%it%Hm?NC|=OPL4+P9QXV1Y6MP_B1>q-jr+
z0;I$4ST1eAJ`=M63)(-=x^N81Ua9?xuD+AN=e0>rYLxic(X%z~($&9v2o+Nz(3Qut
zsjTn~d-8EYa_jYRXD32`eOQ<r)N^G84^%5?Bqs+3+M^j<G4zQv)}0Mk>cY{Ia}q(0
z@ju)0XLVB2ZmHF$5W9#ukbF?sdBScYNJtDp_X;Q;g2i4{H2tbsu(@xFP66$0L=kZx
ztpc5jH&OE05uE*bQw^T(l9DJjCt)0jTG2lfDeuZ716D5nC(s`Rv@nO}+KtT-qMFIS
zA{cn%-oHy*2?~=MU_c4T-G{%*$j+`J@{m~Jfg(uz_LYbKM9)XS$zQX`rT6T$|LSO0
zL>3yP`$<1@mXebnV)#%>IEB2DK^EQo#M=LYy6qGKvao6}`e0PjdgIEsYMrM2r(G$k
zdhjl6JyItZ2tS#9sjZH3{t0Bax(`%;O9=*tz>D*r=m5}CQ&cB(iS_ttfm~Zz=HWSg
z)CMn6Lh`Tge0Q&nMr0f##ppH27Ah1I!xQpYgRd!^Q6oBu{6$wy){opkkW$tit*uMk
zY4&>FzCaBaj7e|T+MoQ!|9|~929tXQ`i^EpTLp%??0A9H%*=`GOGX~-vYe(e^WDbd
zDiTvbic4ZJX*2!9fKsks{n1$R#wPKs3$;vk-V&yD_P$Z;Csz>>u~b~w9GzOOrhZrh
z4(%Zbm4JUXzQU-zf8Td0nI+Xs33_^pqR^l8cz850kbi$auZWGcx-{aS4?DaK9HZuu
zxP-sD+I2!`sHhbgOGgB`?w81*fLL)WRmD%!6Ay6Gm-fjnPP`#Ddb6OY2)?4m{}C<o
z!X;Cx8h(FLqWE!zo=8M5uk+4>8e-cO8fpx9{f)v3jrKY?<+n1lar(T40$o104)r?g
zYs`Y)k9V8HagW9qUZS6pMx@%>o<hL=;V_jy@2Uqn=mW|?&BgP|lY*a}n1@hwV^?e&
z3*%v=rKD;z+lCJS?0e=}snhUEA_*AtR%VP``~&p)$r(1&AOQBk103$gJEi%b->pS&
zov$sW#$XuGhP33VWezL<sJi)L_+htaOANyAejv8p5%;ja)JS_xzk2D6pVN`Gi*)rJ
z7z7U2!vOi*NoVebS^Bpil!}bper`~oe-<4YQHF<xn!knO4r;;k^PkN|xHS(f{iYL4
zbHV0=N?QVP?AA`TidPQlr1$Jyqh28hC}D-z!D9jutd<~Kp<fbrB`>PJ_XExL_4UbK
zcy%hmxk#1BAL*>3*)@U<{Hj*!qK1c@Tpjt7w?X+AkD|Y)K(KfJfFp#tuIr3*WGqfb
zI{3BlvJ=Q0Lf!$QtmKmDgT9}Ve<z(fYonsncO{TaKsPaHuzgHV|Fychy0UU?6rJ_M
zpNx|kcfy`{P_@9{02$;!e&>Er79IeM3(8d4KO1S^v|xaW7|!pi0Zkn-<*)J;R?DM*
z)t_z{oWQ*E$wnN`_Y)uiaRQbG&*J<1(X@R}!35e+;2pD5=pb|i-JrIZf8ux-ZqY4(
zTHaGgLOvXnVQB;D*=?k2{ZA9O`}*zv3id%t=mK6MKpE--r5&I}n)Nhq;ZJF2`}xHw
z2&^2G2A#37BN&)Nd;{+O?KJOsTMAp;1tb}i_tl6%@_S*o_cBQ7BMuNboUqGnen!gy
zHR4@=0>p3>s-hyjT)PabH`ae1th*YBre6!WTO&Lib!7HXm<{2O!~O1Gqr@77Hj!dh
z^7<_Sr|bKBSQcrlE+!+7Q^R$9REL&`25%=HGKJkbX8oAuaoFZV#3=T@E#vk(4&xV1
zAuLOL1;T}V2HGOC+^>a&ho>9Ne_mYA-&Dhn6XqKOR}**>4sHq<#=fw-!wy~_Z+UuM
zKu@GpkJ|_d_91q*J6-FsbjlCgXUQC2zXpnVReC1U(hF@t3NlOd%(E(r#RqGH+z;Jc
zYkG~j<I^iGyH(O`MA6Pj1w_0gHVHDQau=P6h=0f@JIPdOv>3j-!~jW4MU);Gnp=?r
z@67|FB(A4GGM}eVL1_g3XN3yhq<toZ%eAT4D!bD<)*8$O2?1{@q15=0e#>v$H}@4#
z+)Z_IU>i8avu0L;WO#VtksO$nI1d&wett^xLPk}P(H<Bnmzn<|b!puJTf4Up(Q2o7
z*f~4)M8LD&#l)+Dk=h{DfyuNUw^Wi$!CFVhOl{17E8g$?HlN?0#aVcb<&N3+GtDIF
zR*Ld3Zv^J8Zq8fl8C<6kRI1E5y9s!z`$NCrkheZ)Oe$j1!CQKt#)3|QaorajPex>$
zcWNko;H9m5<{v&~^2I=(@ZKW|u>7JV*L&xJ17;Y|b*JQL+Bs~MHMKKU`T=OYi|tY}
zGn18*xcHMDFyN%fxw!U%ALeEgLJ`yL_*l>j!wL=4Tfm@=WABiugMj#=rLhs~jFgiS
znoipRGbR0?n$dWPQGQ$ABxaK2Ar3pLVDP3I?h!=8MN?C-6*(8-jPyN4Kng|rEa9OJ
zqgL64i#@GZAe+L|Sty3U=SCPq2wmJm`-{RXk4p_Dm)(MJvCxM0tB2jbnp-c-k1ed<
zU4=5xY(+aK__C){pRunW9Wt`xXK69^NAd4Tc_MFpd2pWi=*vApIV?)lCFdoOtG2o*
z=ifh1{Qmt@czBf!gn=O=O&{kNOs}A2E*hCP<#j*yROKy3ch{hdMJ=1yHiz(iOKlF+
z<>aViB8c;CxZ3$iu6P;T6*@bJv*cVJEJClWSD<9RncMm~`6yS_#_$%`X%phR**zCG
zfB&Kd$vZzj7~py^aZTOwXMZJxMahFRK><w%bMjE$Q@np4=!G1F);0)COEBWHOYvs(
zq#w`!`E=JsqQG)51Qnys_3#nes7Q@LkyaVi>O1zrq|t*a;}yZ`Tz$^RsP}5`R;CKe
zJ_f+pw)}^BeGP>rS!BpTnI3f)|28)*cf^7c3b-Xr7o_5Pu7-yYQ$ss@_NVk*U!IS*
z<mQ=+WH(=sidff`lvu70trm=_`lq*Sl7ey7NKQ!*=TI2Ei3u~rb)SQVSY{#cp{)25
z#J?|2=bCd0qh?uFKoDANuhPXyOnM6TX<}^bGAnDD#^O5z+|Zfro!2!iGB2V$FOCI@
zXF2#h<{cWSQJy%~^r@<o=j7+p{=E$^y^t48SF{>xvwsG=W2lnTX;xvjqMs)!oI7p(
z*eekp)@X>rF-mgUeaZSX`UGMzHmbq?Jr-$dZ{Ge%po<jlR&Aa_`J!`!(Yc!4X)})0
z?EH>^$CM>&zsDMxeQ}>=&&TT!&mEKK)ubQcvTSFhYqVnHRYfoyl5wAgoy)U4rD+!E
z@Z|br-C0mZhARX0al+3X%#^&qh=T9UVqgkgWd{wNZ3ispA1?>kfZ|uV1&k$=%C^%f
zP;V5m=dMud0ETBZDld%sJ2l^-6I2Sv2Kr{`vn4-}po$Gpyt5BM6u}Y^VS;{eADbfY
z{ZaWc0qII!a)MC|LHA(g?l6O7d{rJDv~^G??nHayV?;6HYr!EeiQ>4@p`uMTo~b=Z
zQv*)doy7Hs3kAqlZ|N!Ngh5G<rS&YeeDk+<1O%~Aq~qd=WAmTvv<`u~F0cKECw(J|
zTdoZEAye-zgOh;WIie|tE$u_d6pAM60<D9-=3(Y*tG{0W%g~*rg*a{Dm=c}7VvkSn
zON4ZMTE4=VqKe390e6}0kG<A;No-lUK+DFi*FHuUGOg+&jv)>AS9qMvN8Rf##h<VH
zdheVx@PU3O);{D@+Fj`IMb!aJ=k!G}%%}9Sxwq`=&1w=g^z0=V*>v9ao?}9wQcL2z
z&C!wZJsGE!@XW}_1($`(eS?`!m?_~t#O*y8^UEsvEW8^wM^@{}=w3jb43r3hxpr@-
z#Uc?usC-N!|NVi_8MuoWe-`-C%VfaaQEKV4MOM`M)OS9frcd``bQES)qAe-stMu7y
znc4sl>qEr>Gd)V4on-bg+Fo)@IIAa{Ho<P)>?M20yDU^;LS&A;<D$Y9!DETluPNMG
zb3>=$jeO=GZ}L;%$p1hT$NE~xnzfTd=qc--S#}w!iPo1w#{6bXZ~YUEP}$uu;9a`^
zejvjC?3zJUk);{$ZOf3xDLuE(Mmt?mnBayr3#;ry;(UL<?xD7xM!0-mYA~~k<j+e@
zQaE5$N-Jj7Bq1)IMjf>@?5~X-XjZgBx%z=#pLs`bJ4Rx|V&CpVUd+=&$Y~No)6??P
zMIF-ftn+2eGb~C`CyoW@Oa0RPqMn|<s+$m4tl=wNLSGDE9iB(6si}F)(3jgzCjtIK
zB?uN3SynR%56Oj)?Z5OYwYIt{DOEW;xxFl8A7DILcRqc=mZ{*Ui2JWYajE|h7AA9Q
zs{E1_nO39Z$eBM$=kqe%OIWW76Ve%Juc((l1q#&@#1&^7bc5u)`H@I2bo9WweLIn<
zx&_Zkp2;q&i}7ea+Veya*yz!Eku&ejcf8)*C;S?^XUNg!Dy<t|2E!IUsw>b&@7A>O
zSw*@|$v`X@IDB4)&W54Rz)*TH9uxx#tHOMY)N|m+_YbNTQJ=fuf5;=>g-I0U=bOwQ
zU`WU)Dc$XUp>lvK>6fRye`UCpb|7*x{UUT$>ES<q4cK@KM({kU7s<oc&mP2)m}=SZ
zlI@*$g>tjoOF10M_T06U0{)q1X-v#N_^0bCDKGC+k8{#I&Wens8rwzrd(SALVpxdS
zxOWa9L*PLVzOb^hjRU3^IRKM1iDYu6dW6M|aYkALVvXS~irW{Y_v&UuDn0gFh8vTb
znx4L&vAgvMhhErALIIc#A5yO5yNc%vskbufJQU2#dVyQD$j)S(s45~L#*FXPQ6GOV
z@ec2ifTw`==qv}lAN|u!&B9Ng<OUw!LwX`RzVfi_CE7lOUhMY~JehY!@f7otVA^p1
znr5B22^JuIN-BEt4<mJYPRyS)GcZ6mL}*qnf*Lvtz#+<gaIL)I_y&G8h`YBamYf4_
zfp_GOZ>M<^1pnW^es%{8>sr)Hf-^@#4)NCCjzY;?{1o-KlLtUR1peQdsbrkkx0aj_
zfj=oQ2>m2pfIZLr`~KcJBc;;ofIUG!&bdc*3m_zf-w5fW7Bv+M_y$A~_#6EDuN>uk
z1kfuT^Eb=@O9^PCfG;H#JXUZ}md=5Tt$=u|w}0r5qkd5F+y^>v$p8l&!e0I;06|Kp
z%ZE%Y^9@gXQ2jdP9z9)FN`$|cmLYFGGBM$FUhclwmi@uA#s}@szVyIkUwsh_J6vDi
zY=s;gSP~MVX4-RdldI<#%<Uas@5jK(tDK*S`+P^(7%QkLPCX4`XK99U&Vy@_xmP9e
zN<(9CaFB80OL+K`pW00SfvWh7am*Kwz`d$<_O>~-fmi_N8qaOg+TO3UeoXied)=?%
zOzWc(`OgKLQ|evcdY@ep8>_j?`gD^up78}=jHnz}`8Zy;3UCc(W~hCMwR27<8VvS7
zew^fZ(*^<PHHtC;6LthrNK(>q$UpAgv;9%IEVZOO0;72>hz@<I`1Ca~*Ia%=pYsz%
zNu|H>Djf?{Eb$rjAKN<JbPgRGQ~8W9q^%as9E@sH(+40SI`L&$T1uy%latFSy-*;g
zJCn|(H0a3r&SArI3aFOa{9|`?1+Gs&Cwpo6(phr3zk1c3s{+j=xV;R=N4F(f#H#M|
zmB`7}B3@`dTQrYogbDlC0qm)ZV;e$&pO>VuY|<0^xRNWBmFxH;F#L(kqC!EyZ;^Tt
zB|vNPtk}9B3E04YK&Te=1_KfBDZmBeG|k(HKYPZgRW8Y=>F9WL0fXwQs&ZIK6JBBP
zcNb`?Aj+|Oam1$|!<gH%R0a&&TNpw<erY8WL`8inEj{!8(8?JT0MI!l8#uBd50^7s
z(jAmUyaT?DX#bov-?SRN;-eY9NI_@!Q%4@Hz9dg__l|!fO3f&1Q<qnKhdOC_3LPka
z-xh!>a>`7xepz~;jJf&dLhI-5t}dWA@+2(3q{J5Vx#4}ztgrW+W-~g3OmvKmMK>;z
zbIf*j_8*>L&Z|((8lq~wuqd*XoL%N)_rBhko|bj-YcbrS!;J{KFhohb?KGfwt+GzE
z5BW-&KHGHPH>%sga`!<VrS7A<FKYjR1QDNdZt*+sP(1!%DfuO_eru_7IJC#41xo?M
zByi2<oG8D)AqBveBbpp2@H-s)(xrZ8XD7^&3AAoeQBlcKi=-!k4D{)#i{Nd){>J0E
zIXwplj*J?Yy@fzRc0KKH9K({0xv0|3MJzJlFupl%0?{ono~Ji^>7AGc12r@!+3S2A
zC{}@SNg2%IOwh%>!?@n=)AgRqmuMEH7w3YLB?ieU%f7STWxV&32sBGPB7#V74YxeM
z-V4AbP2pH_Ruy{=MX1-Tszu(QI5sYE4_xp&N`pWI;<=$=ksKjLEpeMp-dLmpIJDZn
z{{Ae`W{WVhEyCAm{0r3>0o4Hb{S~N5F~JRg5j-9g0a8BA^Bvt6up{ZG^z;^7+OJ=~
zmbzE2_##J@Ld3{uIaehcw6B2<wdUsL0=1_!jSOJ!W9YNA^8NjNpuGSTeomeA^cpR=
zs>;h7b)$Q>olt4Q%tmlBv$G+IeL#ntjL#t_Jv}`)SL$2~K(^|3E8T82yX9fM2vR&e
zHWro^Asl@ok0pa)VU|KQ7CUyU2=qzyg3@6(SGW)11V>UVp$O3+;<~&yiZj$m#+Bw;
z;J)H>|MWX5Bmqcw`#dTP8S-jh<~tY=+ms!D<6I2O>-+o0(9~RFIO4${iT4zbQF$&p
zL4BG8qi8%nA;E&^OR)_`)laLFTovlg+TLWL<lazeKwq%3>g(<O`OVSX+4%&Bo`90+
z+xN!#LpJnk8X6Nb_3r+De!wVzV4CKoyu3UHD(-?vt`AOi3ZuM?jLIY|Al{&%(+{aW
za~YAJ>OPUaEM~#>AB<!o1C6s7(>Vyq@(~4&8{PE^gZKjI>3=h>2%i4(?Hk^^iYD}q
z_&B#&KXirE#Jz49OHFfhB@M3c-$uH01UJ$<zl@Y2C0_)xc;v>)wF)L(XAKg*I9ToH
zj_(DB7&<qsAoL9NgXV++$s3|_7K12|!?CKe{!tl3s^xY?>fc=QW|V%#LG>7-1Jg5H
zdT@RF0j-EZ4@10ikabQA9hsm#>*E<igAB%s@vi^~!cXEf$u7an=v!qm2Kh{|qZ}L}
zvf4sFFf+1-O)Fvl1BG<M6io(s;66sb)TSWlqG)Vv3}45kY25+n>j4tLT-BNtP=E(h
zfxE}ru}?{Lh67*3DMV4vJ1a8owZE8z#THf!?zU9BIl3wI$GEy?4&WKw%6IAH?}-0W
zgGffiiqB8Dvid_*)Jni%Z6eWmCM=@tyGev)MOJhqf4k5d{bMj}ST4SFLY{%Q`X+e|
zxJdXrA(Dz9@IzhK)7Ertqnla1_NTzYu9Mmzmv!H#kmL_FDOC9QtIwYOoS9s=1<{?V
z#XbOxRVlm3zUZ}{mZodC)2=sHLMca45C3r-hA(x$@O?)5m2-)>Yq2Oxjf2+%xG}_B
z7J8d=#mm;R<3+{PyAU4Q&nJ+Dq01vk^?8V1ww)JpjPygyP+^F+mKHk;3y0g`-HYRg
ziv;%?@{!{$ikz{uY$`+B*E-cz)yB$R?jE9`GhvkP7ELuN-f4owmKZjz52Cuk?cxpM
z7T_zZoEn9PwY61VUT&Y>(VS^T*&u3I{=^tEXH$;*@I~=UPEP1s!v)=2Tl@!~>5Ief
zu0H>)MUVdXi+y*DNXcZd<M)w)QPFD-e68KY>C^A7b{QL*?sKsOHNxiTc?q)F)y%HX
z6Tz&kM2Pdg`&>ZR8L3%{5*hd5i-fktb#(MZszh6=M>x^|rE%UlhCD!26Ofp_?efem
zBjRiJ^M7Ej?ega%nh6ZQzwzxLw{clbbu|yjW|ox=FWUhUpB^H*P_1-v16zX@s(R;Z
z-5|J$G>@Ht&BcZqzIvN>P=*4Ww*+V8ni9@8|G#fY4ky*+uRtUM#lKleRdva!RxbVE
zD-gfKW7I;lB)3=;?am6D8xF%yNFNq~BFySX1L{d>DckWKhUCoDE^Pk|y)6X|4FXgQ
zB2*r755}0o3b0J@nZa)qL*h_w{cY)KV*$JPaBnQjR|mRGGajLSUommzPZ}6dtlY8F
z%M_7LeuN0X(sx}Db|V3(_dlUdpJ42KXgXc&?7g%0G`(b0R$c<PDgtgee$@6(q=0{~
z;Gduu{IokC;4#)WpG%BXfC_$n=s)md2B>h|ob`KS9BX1l1lI0++q^tG5aH#3zkj%`
z#$^@d<%cIH$QR-s!eINnQ_nQNieT@F-vW5@uVmt&DFQYD?r#XXaSD#Qaya`ZF}EP;
zR`(;rw_p!)7M(#n)!R$U#vUUHy-tV)hxk_(=|K<|z|Wg-h>LjF`%2#J+-<M#(nI@s
zo9Oz>-rn{XYXl%|;&OAP@cC0IfQdziC1?OkkM8~(Octnhcc0+mZt=Sv$~}4$G!CgY
zk}-03R~HvIbvW!nCr+aJoB=%IU(h=vcs8|382o_00=2m>07tULr|at*H?`;Iw_8-I
zuD_%|C0!$6cy(Ua%2rkH8N%n>9}6DgptnE{t_r*tAZ1dV7>ZS8shuRww0wziso2sc
z$jZVJ8C`j+|GpOoNG1>jx|>n_?MFevQ}V-ycTD47)OB)gYu%}+L{*d@l$FIa`7Lnw
zHNs6B#2)W@``=g!RJiPw85x}|DB+n3`};%v(62R>1`OT#>zkX^7Z+EH5sO#<AJX1B
zDypuH8&>Hq1!)kKZbV=R5di}UmF|!bkWz*mB&0*Sq(nlbQ9@$q?naOyq>-U>fNu}R
z?fpE@`^EbBhqYY8c+Nii?0sFo+A~cBm=&pP3K)Lga67w_YhR7n>WrN6g(Z(aEjI=X
zfJV*B-SQ=Rv~ZmoFMd%4#c)D`Bi?RFVDrdz0M)tRR95zE-5}e;wJ%x@P=z%ITg=qb
zweC+iJO#zK;r*eReL~Q={eI3^^UzO%m;Jw~(e2fiw0CvoW{<u<NfP=EK>$ootMi6s
zV5&uPSZ^v4p})3O!E@bI^wZAjbV*_1yZ7I@?;;)d3U%T#a@s*c2E!;xTdJ<37-B=T
z0)v=nDXrSqY!LLAFJHU}O0mMLK$pY6%St?#($N9R{iLd@M}FyZt7CgAbG5?YENKIt
zPNTAd*I$}*++-S7$Yn~D??pzYcoGF7R6wcs8%M#<z0u!oz=VxNEvsYg4`~=P`2_zO
zYY^sR6>DkvU}}RTW$y6EZ=F5drWlJPa=%ma49+_L0|{_4I|Z!BqAX>F`I1Nj8uu%C
zV1n3u7*hLFO*Ub3omKJ#YBDwV@pFW`oj++krVl08>opJ<^^F=E>j3I1qPFH7=o`De
zH4@nI3dXwk`cjsvn|=}+ANzfw5q(HZJ3i9sqlCJe0+(ON_nQu+lX$i1mH_NW;_@=a
z%5ZdQdEG96@gnIVyLduGsFRXwqJ<ih$W(5Dd&Cz+>9{|tjg6ID105H-G6#2vhW)6P
znTYj_PAQv+wjfDK$z6$vyY~}X1dO^~R!Y6f7vRaHU|^_65hEmGvY}2~1Likt%0-mK
z#8duN2Fk%H8tZPg?yptC0m|SvjGMrz<slC`q_ejdR8DI}<0Hk<ItN0dCNf8m-^Ci2
z$-ph!3EAG_usLX9v|q&73v$qcqF3UxYOQcv<N*NklqHyn5KGm=5)QYq+@@dCE1FRO
zlxw8LZiQa1Wn}2{U8UJ8&*}aetPb>Gy`d)%p2D1TEAfAK4{Wn%M8=yDr2jV}By@H6
zGdDM@Z8Y5^0G`aP)$`rrs(TGUB&Yr(A5fuy>(2HC7}fe3hbN*)OMXp3S2rkCK7{to
z%^N$CyN+AuP@xu8p*NYS{#iJ##v<^B4VaXSjmLi$1pNc${<L6iMH49KC-oB=!nrHN
zE0S4%`}qd9x=2%l89?6>t;{olUvJ%Y>ehGY**@I52qmXTaTKex{fXJ3)>(&dNDI`-
zqoF#wr}Lc_R{j@B?!)9Q{#ilp>A}-a3{uZY^W4V!HSE+Q>T+oqdArL)OA-?L^K;P&
zT`HG7Ig=j|ISUHL*2=KVd$Bfv74>dwo*yxN<yJSu-d=6rsyJ_WFd|~%y`Q!@ET$h_
zT!J?6%caH3@WbUG85#A1x(5ISQtn&Nn2#eE=?Vz^Qe84IYoFx#yg~`yP!JI?^;Kg%
z(RM^vaLU-iB4p`noaMiJPf$rK7Gh(3{77;uE3!+X**gNE0&p}m7R<~C_x?EPPpfQh
zHcj^2z@Sw~?A|j(jgghY##4z=egmX{S%)pCbTl=4?CS<t#80NVENV8Il)dbzkgu?@
z)I*4N^M<%VgTrC~r(j5~=Zy>&5-tT^k`}7qbsjn8k#*}ATyB%o=Ik6f1{Keq!Opie
z!q1Ir&wH<to!s<%O0w%dB>LAwojq)qrY7`Bzh#V#11b^$5htub^4z=ftzh*-qb8(r
zi16kWdb+&~%o!mDC0?P*ernc70%_<T3tR$=w80R??yXo2u{F7OCfVQCW9uCU|3aQG
zScZUw6F0+p>=EAB21^IS>QRo_ucvnnY|DTW%l0LQ`SnWtb4~A)`h02W2wzlm-;q7W
zAi9(Tu%%xB4!VE<6}I31tSd;RL1l@Cm7aLJ4Bk2S*G7v*fqt!eok$;)u}Wdj>o_{X
zYVBa!@6af0*FnrPaO(Uoa0=%4vrzf#3sJ;meBcUye<F{J?D8_oOPT~(;2wJ*;U{?t
zl}?aH9y6S7bhnOQR}Xu`glpsl3Wa)oFvXtr`m>Xr6OwxH8|DLORju*@RcGIFJK{J^
zUsIUN)nIlL$J^mCpopUG(8CAL5Gd_+n1WFR8etpUfPkHIex4#e{=uSd(IF@;eq;OQ
zpFG#8q8H4rRef5i-(eOf?+mQ^u?|NGZO0^fzjQ!<VS_$AGPy>tN>B<B6JDe&rtYw1
zHz$S-aMFie$o1E8O5z*Jirx56kqJfUW%Fb=8^9i7VqdTzV2jQedggqP?B$gtA68Rg
z-sdLEoAr6;iQUHYoHX=0_WPEW_vvW}2Prs?=;61W*qlIdK~E8f-e;4#c}he%uA5T2
zb+)^7nm&l`&UlLeFbUX605y89rzpkaL>8<*KMRd^Xg}zM{tt!5OH^pffMt<57Omm}
z<RaU9wtwc#cN@$1U3ItQUha7AAx=MKvuQ6G60xM89JW&sPNz+Q`Lq4dV5>w+TZvl_
zWu>WE;}q2d;1Y4v4XgE#OaNN3R!>iV+}Lo?xG?IlF{AFKqkJ*bv!M<sJ;un$(q@zK
zK79q-&){&(T~1as7?~|s%aYlfcmF55TbQCMR3nUs3LSxresfPQ2q>3-6Qrt?yD4a@
z5Lqf55TFD}(j_7)7+A~&>&@>1@R0#b{;{`bFen={9;9T2-^x%yAQkw0{Ja3F(t8tZ
zU*;_%x$It<)Ka&-YdkzeCC}q0tOCZZceuo_=O>-XR6YK!!QL<?hXWij=V~Bz6+nAr
zVJPFtni?gi9SrNhp@e1ID9od^WeRj-SAeK%_ZnAD<lqoCRMMV;Zf@SE4~;1?L$rut
zjTib|@~Yqq`c_k#o}Q?B;OjfmLX*(M1leyYUtkd2{%Apj0}$`O0k$86*qhf3*p{+V
z)RIK(5=;p=<&`JHB%kAwV1bP>%@u5Nqddk_!6T5kw<raVQ&P!mn-5E;49QbOKjy6V
zw%(?d_%~t|0KpSZtj^Du?VY>K>E!ta>u=O5Z^q!p_mA&WpUv{IetncXShzJ&;T=a<
z%KX%Ew-}_*$`luIBsKIEKZ~Y^t8yPsf)?%fP)7vASlPTiS9{+~0?-s2*_&oePycQ3
z@srTh{Uke?pELFwODvki?V5k;R?gWSdE3tWhs{4E*|DD~4H|2zLKqI@Hq&6%r>{To
zGmzVawEt@PdzF0id4pca#0Jjz9wc=u_RM$LT)oz1OkFhp<s0KIb!xHZq9P@q-*A)F
z2Yeit#k7}&%xcwWh~qyl6_?=JYa5HJ$`)QTP=yF)@rzY5q&{R|pFQKnwA-SUyaCn=
z2S6`MkU6uCGUg#hogI~Qhhz+*uI{pv?Q7HH{7O?g$%&IOvn1P#&pt<cgMI{Tyar(g
zpU=nmz6?v7xkWSDU^HIvvgfVxdB7gx@T5k9^+AWx-7&gog?K1Z74V3E(-ZawNd`pN
z3Hc<~Zm7I3+r;cZ-t}2mUI!I`+$}vgi{s5I1gL^aKfSH3)i^6W)<EGQ0&%65)K&zR
zO1TtfDDq@#ecEv}G&K1josnDb%eM0>$6vG4(A`*?Cf8DJr8NoA)nwb*b{LzPnUiIP
z4UaExNdgt;KxJc=JSboQ_nSJo<AlSio-qKZ^-o~C!b6wn%u>n%+S;M0(0*`ram~0t
zUw%R_9azf>eQ>Z&z0@tEH(a3~3Z-3jMv1eW_B)%_JCN%SmZs~kh@@Xd^Mbz$f|{DZ
zAp#-Zf0ZVKP?)vpxrP<cZw6+fhzB8~aW7PrAmHJNeTQD{1(o(G>^$gUW%1r2HmK`m
z>L8fRyviMGr;||Ypc!xXD?X&4W@#R#(E2g-alqB3#%DncK$tAHO^<V!ZA|CMoy;3`
zdRzaGcnz!5pijMF4+hBu!8m{^yfH#<Y$g1DNep>?+;@hk{Cx9kQ1+k4zp*f!m|U{n
zn$O+e7mXv}u(z|*douVVDTyI=h44j(`yo9ny0a+tK$9aCjH|*I{A?^Vud)dtH#g%W
zr71~*(4!bcfU*O9GyDNRS9KHCxj7Gs;}JmWLXi7e@)50@=7E6a!Tu76E@>V-u*Amp
zRY;$`f(B^z@lRtocMmV@3O_ZU-~?<HkCTg!nc_x$+vWRAFdV^8Uk?t(H$LO_9>TZ#
z2ZRJ4pvzLZS{SRi_-p;^)MSGf=Z!F55>cg^{F~Ifeu7oYBgXgwNs)b&;yyfv@DLd-
zQ|mDwMlwa_(_n)dK?^_}m<cq57j?<HdLG409nwpmElEDWKUrs8ES_+M56%)|P(O${
z02++6w8!rS??(~p6A|mBiE5sKslV_T)<^K`pxU`Wvj_2a<e2>-HRfWo9h}KMRwqe@
zXkCR0Bdxz4-*at};_lH|KQ_=$Pz2f<&rEC#S#^^-^}E<d!^`OZor3CYE~H^rA*8{G
zVg9@HiL6I56+dZ9BC?Lf_7$XZld7ZhN2IvH%(f=OQ>N5mba%lS=JJlMNK6G~Yz(|1
z+%E{jp9{i$rd<fGW^yXvW&V^#uL6#e4&-;Sj!fjs>jL9zUhXX<2c72(YFmI#1y3Vm
zZEWyTMK&SD>Gbfe&U~mf!7O9-p6mUKifB$wdJmG>N6N7O0L<JpOydK3A1)<QK=+|Z
zyrEPyi5J<8WIC^|v<ZO$%-IpVhDIsllKVS1K{m^pltctq<y;y7dSU@maMIX@MR%fI
zS91x|P@{Dd6ExAdXmE#(2o3JUy@%m!e2jQEMz;FBx@FuiiSebI-KQ0CE2WBqOq_Yy
z0*`H2oe)nEVr;BeO~*sT(dGSYlWNHo8*ho7meVKW%Moli<lm1E4R#|MtG16yMI<Hr
zmnt^95X^J^KIOVSlHH8?4c{tPdW$@^w)JQ(`-a&r2D>7=^~L7Jm`4uiH;hR<mZ=VT
zVj0El-*<S+HehK`o!nCX1n4oI@zAd&v4?>Ik2Q?Sv0hqx_#_odNLX>ay`pn@l0aHC
zt!zSP$e!GW=)I3NlgO1xNV!23i&SuO#WR(v<kg5_6!G(p=gw$oO_l}C8>Gs_$Ug-|
z?|D!!Srp1YBt6eZeXO|1ee-lX{Hhab;h`61`KIFRf_^O`*rzfJAw1NW_n;*2WPOFm
z8jy=0Js^mqiiIm~xq4+Y;rkkD{<TQ|IpQ^GO$-lSU54kY`?r;efq%}yETwYrCaUZ-
zPLPG%F}Y$CI+qC7rfL>!a!5mNdTZwxvL<s|rG~WWMTiXBi-9rFaW>my{Q+i9oLn^i
z;YGGX`D-_nDc>GqIG8nPzLW7{8A4fI)4za(y0s_e-kkL2ypxB)xY4`;^@+xK&A7C!
zaNJ$S%)fePOl@~CUFu%Z`rd0eG`nW|;;r`&;ohz0=rQ<-!QpB$3<0VZ6HvKlTrgi`
z2Xf!i(aEj5$H-{9nn=pjQ`r1jrr*Qk*5oGQ#-k*l$RL;$ZCtp`BDe!V%$K;WidD|v
zz=@juy5VqQTNoa0yQenJ5=R~>ZIRr5LBY4ZEo+0?^2^Yj1+!^g(u*N$^9LCO1cHxk
zvw))Plltuol2n4KNBW2SoOC^j>arTdR_m{g1tgJ?<9iH0#CU}zwUWsectLu*KUa+3
zZ|d1|!eN1a__b6acVN~|VH7|m&|N^3@0A%ae|5G>o;R*CNgfk7r>?88<KpAMq_!Z8
zmEH4C=wC^=84z<2=ozigtsYenJwE(&CDZc6=b8*>{mMmy<3z^f>g^#ZsgrYzs*M0x
zTe97g)Iw46z9lX4pE)L;WRZRqHdTMQuj1)?Y&-{-ysbXZwyvU+1r_h7b%gI8q@3a;
z0ui26gd(>0Mqe$u%b=7d#f?w0+i()!+1UYk6c;__Y()T$iwOa2;>>JIu+NoW<FAq>
z|HD2Bh!w-*E3S%(y%~8xp=DfL#dzG@MLaWrY=rNB&G4L-k+sD7_`{zC#I2&XW=Z_#
zLYER)jUwTfMJxj7qw?S^ai-TqZf^5s_VK0QEOf*6arkHTjsP5heH~AMh&)N#5zM|4
z+B#IXJiR`cu9bEo2SGSybw%ExpA}1{CiW|=NxBmyHbgbIUuBV=*JgOBmHrQYntLRJ
zCP}@RT2~d`Y;SMR&zq)5#m&vlfmk%Jq%6gx1b>8RiT&3*p*523Sx#BpQcn$=CHgBu
zZ$iaRKWgn(_RW}|>UP=Hb$=*x&uZLwJQgO)<S==JkA{jv;1bKAdJYYJUe_HJb!TbP
z{$Cx=7;oJI`;8kv9<61PecglD$I~}vA*fgAAI~gFzY%bB`JUmK#)&}b(q+v$;F<Zt
zVJ$*pSgW7IwCu1<1}+CNZ7H-OQ>`~D0>h>V`L5|m<;yl+u*Q5H_8Jbuc|82NVrqV-
z1-z_EChkvNY9B|P;|hc4sr)DSd#U*D%;Woa{l`nWl>&^(u3J|zCNbg8UEVuhj2v7o
z*U1M2D<B*NJo~?XrJ<stA|tCKuPp-#gYu9zCL)yXC-c7Y4Cm^}x;p9BP<?62uomOb
zT1kovAng7i3JB}>@9XSq0GSSUwfdL09svSMiI*H8Zf;zb(mC8JKt}*uxU~}=@v@#B
zo7u38Zp}k=*O&}EkTdxJi6|G>Rd$PHiwhDOvd6@ZBL@<;M#sl~LtkFY-RXeI>y>y<
zlp{RDq~1z6fc3>o*5@Gw?BKBRn0Ry7kahKRVj!p>+iSKo`uic5CK>!OSUj{5N8j6T
zdQd4DBu;gVu&zNB(zi+{rN#~p4uV!c8+x%nGXk~2EBzQ&J~&2_Mpj9ORcK&9JQia~
zF+<7NqYMZ31?YNgc+0bgFGcQ0iBKAX$xiBG#swF2?g|}Q(Je83=-^N?X60D<&BDS0
zv`WAs`$>Zf3?ewACD`A8-AQ~s4;%v}LT9eG>Ijqx^zZ%1nVvp=Op(Bpl2LauIvUPq
zm)NDIsfkAmSY7#0x;ks7n>Wkez4Mad6u%FQ(0A{)78$p!!B0=@WW&Id79Da~8*3Wh
z>QED}=6|?8M6KR4(!fGdyTBW02*HA7jSN6v?k9?FsBoQMJy@Q3D$T=&KTtb8g<LIP
zN+T<~pQ0f<T~~IWc<keFZmaXHjZ?f}eDwH>Twyqavl|Ts7?hArK`f_k003vXa@!89
zbU*bH|4XZzGEjN(M?QsQUnWAOew7K<)kEH6Vzh#sdcx&7Lv2Z)LJUP7@E(~V7((Zc
zR-l)`I`(}3A;#MrgAkhLknSGx3EY$Fc~MoI#z3=NS{zm}u;sTEm8=i%&q1AR@m9hi
z<?hvur$9aQ<dzD^KrBtxs@fkbE_SmcWiO@64DahceQt$Gg#AeafcQljhOUJ~vkpbW
zxWr~(YSO-(kh_%yp_?FPTz8gTNaV;W6NpSu%%aGWR8?0m&@RgdqJ7hnX<;$?eGPT>
zk>1`n;o*H>zkUIdg2KW=5-AiP56|<C81CrkXocQjRT{-6FO!ez1#7FToo#K?v(Wup
z%y-14g39hnhWDXm-IRG_?o|~OQgZQG(MWwiFS#jbOv}ZtzzTQu$C9u+>=;X~6Z%Cu
zaZP%kO?!4JM{Q_A+MQc8MFhp{b4#9CiXWr;bw8*PJnVi1a6NQe&CZp}#_mi|)=+;N
z^w!gNb83;_=c2{vxXpTM!X!*aeI;^9JAs6G_cQ1_xx};5reE4EN;r-H)oIQCOb-<g
z3cmLZN`g>ljk;wClNT-V-v3-v>?C!1!o?N1chxwag3Iq-Jdj$V*!XD4>uzS!>ga1$
zrR)i;(i@`^WQ@OwY)h^VvMxexHxrqJEw1pZ0H{^=^?|0p5`1=tIA8N|qZV<A=(E!k
zd#lTsFxKm)&I+Ae!x4tG*h}Bm7-9%Yo~VwM*&r~E9;SCC4jhC!)t(=3DlXpn_RYgL
z?B2aC^Dfadx!WPwe<HuF#>ug2aO+jYjZuH%FgH<1pB8@k@}=6xODDxbaT&R}!FfS+
z3=BXHa1QQDo{@%FT3MOJ3TZrqgoK2EcN&~+6(+uk>y}$kpkMYpNu<+HUJmXpm-&8o
z?1}bF>fW+oTLIyYzWb?_oq1zk6=Ll1WL3#jZKAw>y)wPz%zW@ivmI0(5KTVM(Fkd~
zUH{zEnvFE_Ao%j_DSxF#$~tTY{?XI0MIin;<FZ*I6B9L6irR!W1)moDrwW?im;UfO
zP}9_QJ1h_z2@JGu{y6weTeyV=2QHWuIWS$k4S}`vEgxDUp3ZfGUGi>fC219YQr_;7
zU!JHWBCK1Fz1xS_)k9QYvc`hIQQ#Xt=&7<eH0I<{_jwr@VJ&9<L<|AE3A%(HS{Ha5
z)^Hz-OJ<N<pMBT`0PkduQ(+#FkBrfS`-EiVu`;cp?JJx|06O6Sc+;rWhtYhr(&r7R
z!!TJ$*q}K%tAO7EwBpu6L31f)WDzrg!n~l-U2!Rb9_G>0>tE1Zz#<6w5aWa3a>NxE
z7q70aeyfBwlr|Yz*x0n#RO{K;+ZT`jn70lcMe;plQTeGSXuNv;dVM^gJj0@I@xTXG
zmb6WI-dY~4B|Ada>Z0wGCsGjy?`>LY=otxwLgutZflL*r=TeE0Gu#`frA1i?rO!9^
zQ~X9l`1w1&ttA5#ijM=QvW4xr2Je@WffDB}S}>P1<S5-KfcWNc`VSzIm{4>Dk+!}!
zd2|d%Jx~N3Xjs->EZi-txHwsCDDWLUJe|1nF4seUZ@9wKT#v1WFTzx-&IWqh+vAxt
zv;~+m^)IRs{B|dvAIRA}W|N!22JOBvG3JmdUoR&kUsk)XfI8OX^r(ov^z_Rv1JCI&
zLb^;q!s=ObHMBur98XOsYNvK$ES864*65~JFH2Eu-Oa#_>x6rSv^cUi9Nn19?Lb_w
z2yM{WpXb|+AmZ<sXI-&aKVCF~V|2Y5tLik0AD^tj*#nfVMp-^)6OpiJuQCV{P)fC!
zf_7Pjdd>S?>Nuhw&c54;>s@bCX}vt(r3nDXR6MucF(aTTPP}>JG(H%uzUjSrP$~8H
z<_hBr14mLA%@ZvxEj2YtxiuzYKjWbs)y$Ozxd5POZZiVY%Hrl4wD}=dETT@r@<-Ub
zwKk8UH;fhdBhowzR1oVHl<&L7hoTTO=Db};+^dcEG`#p2E#TgccpWO+Dam}{2o*9w
zeymQr4!Qx|kSHG?riB5CcIA}TUMHUWJeA()`6NRi99p}rWcw7O0Lmv|kH`J&1Do6y
zBvWPl)nx69<Eji$t4l5K&zO(2E~~In4@<egnV#JN*%LZW#DXG)W{AGW@?sMEG;AA?
z)ax}Tihh;Qku+ngG{aB)&P#hCmu9{<AaxcitC}|WlS6Jjj6Q3~8UiWd2_0FnWCh|v
zYaxmIhY(xKXWF{9bxz1BAHlV1?U~nxk;Ffqye~p^6D80Ck(jqDSX`~fJS0INj8(|~
zc}xXwYr;gq;4@J@IZXbXZ`M2YM<<Le{^)JSh`eufwEV@=qm->=<+Z5j=sIDZ>?-SV
z|64#~qvJw7`{L^Ttr6rCb#<xDR+jxQJdPK22+eaX!+#NMK+Z4G*FQ9e3IfvZ!-^QB
zWR-xS=I)iZx^uAr1Uvbh(cMN)5r_uwfeOiL_3<0*%9h`-mIm#_tEvwl_N+(OFNVIr
zUWKCW-^26;Cy#_4P)g8S(EWIFwFnX3w8{iX;G)WDaOPG`X~vw-z}U4y;kZeo_)7b^
z2CzigGF>r%3vNW+F*v&;d(Xa`<(nJ5y}j~`?wGWKrk#ZvQ_St1J^3>u>b^EB%gYKZ
zoN@C-1<f4RAlp__l{g>ySWWFZOoH-a(25x1<%sTj(Wg%@3=D1^zFY!WACPEY3+FA4
zSIb;sRRCIVU^G(FEPd^R2yAPtxJ&{1kY@nr%?5ba(3?KAfV9i+-E8YMAb9#_d`9Mh
zitl3g;dEx-Py$+~=KtZ{Z<y1OgB%Gp5rX&wABeHYK_B+N_2h3!Rt2eyz0g{<fp5oV
zFeBSD4dO9cE_&nik{6nEj3g)+Iht_jNz3Kebyck}NPBwMn^y#hM9Rax3#4v_<gPK6
zS`1tlR_`#g$h72;UmstzuV4U(tEcCAO1j3Bp%`v$I$|6*OB!nXo5+a@ic9?(t>rZW
z_(`1;I0Nde?qh<=347ygTS7Xa@>Mkg-*qG6`2@wt9k}l%nsq2u-1t6k?5BvZGusb*
zi{o~f^yK4QMFcBmL9Z9akot+I+N286uk9xdcsBlx%n^fSv}h<H(DI@I&*GCmQYueH
zd)cPzDKI||0(_S1rY7~*FJ7h*duqFm|6&#t7NZ>%xYvHmg1{Wvh7>2#-)Y{7R$2T$
zZ<?={G&!*KR8Nmxh(W|z1AWC^ojMnYF3o%Sa!Vm7$?aFtWLnm$wnE>&y#%^eA<=8s
zdNyXst-hs1b29pd{p4W?UPo|%Iuj@W*?D3)B_&}KCMXnF02c*Umr#!V)k&AN4X4{O
zGS|rYfav|Q083~l*qe18e-#p1zhT9DH&MNcmzGs~8pXa1@t*v+GN6yNKbhO{-60TM
z;|~c;J8YLK)Jfz}>j@5RoQpqT|0(pp%s$N<xA<9;1O2)H<ZXJI9VO6q7i9gzP8{7l
zxsXP-FCUy7FGff2SUF?*rDtSkd(Gv1IKaBv0KP;KvRi#1dx`Y!K|AM--tgwIeMc=C
z9@XE*g+4`D7t@i-($^bcw5}Eo-<^2EG&kpZ{})u52aYUh%0_K;g*sgX>l<2Kztbi3
zwe!|&BT!aZ1%@WGo7XCg%qVK|U<-Ky`vj5xbj!p<XP&l$MYgmkn#)YHX<Xs7^r=OK
z3W0l`&7u6q1n~B5n-7{{2FjUG%w98wG8s-q3|DNNg1d0jJ6}u|7b1sMaQ-3o`xPy}
zZ*2SeDu#F8%?qbQi`#u__@fZa+dzLdr2J0);UdF7pZ=q$TBMEbxZ!bv%E`%j`}VCt
zO2<pEApGBIu>PVp|L?d`QZ#R>LbIwuamJ_O{ItK3{}gF40UT~yrG@HreyLiBkC4$g
zOJ0lWgvn#6o|Y$$3Ag^{Z&CiQ-+#;Y`8Ih+U)OX<c^svu>n=3uyC3Yv7^Khxu@)#^
zeu6A=3IXQ;I{jN9f<`ZV4L#}r5>omjTf+5&O?%hPr@Z$RPN(<CJ6Q^t+>Z*k(nEQ7
z;oGwYDXGkhwtK^jrxX(homdEx-3n447ceFlfP%UI|51}=w0@wuf<P7KcU^)=^l!}M
zXoi;^C`9fwyN1x~yMZe`ob|}Derfn#Nv{5zwsqLLrlatuq0h$~s~M7!1xvb1+E9<R
ziqRr>iH-{ybH|6NFAY)oW{iGeiF?97PTp6~q{K7`gW$Sh-qIn)=O`3nLjfRPuTOc1
z2rC^?Z?Bd2LMu>H;Vilgqf&G8iLlEO12r?u7qy+#Y-y;D{Pd2pUoJyyj)xsM$7}Wl
zts2UPR@B7F83{%%jGrbPeOKb#Ymfya)A;~SQZ`*Wh_cFnvv%eup@<FH;t=s)1l+GI
z4GfF~K>^JmAV7J4;n{P^eC3vN$%^{U8O=Y|0{n3w-n-Y?t%pq;h!DXLMCnmHN6!Ke
z*Fg~11=<&{BxQEt=Wbe8)c=K>yyj_~HV2`XNh6TE(N6%G@JB=ahtU4gbmcmM+%ANE
zZBjl6kzl{|5NW;7|CJ~uHG#cdfjwWU6WOB|vELphpWLoN`i6<c#4wOmtjr4BZr6ht
z>m5IR8*@(A<-U~n!6=GU3m4eZ?a?->3$FNrkFPt)8YkUb-EJ+7(3v<HeJPx^TpMk3
z(#`rRaUIm?062C%DlC~wlA*?f-UN$^{>Hdd8^Mq0{qQ!fQfqwkN8>-Cz@83XFTM+&
z2lP&g4q}y^^V$FQSHJ-KJvh_U<NNwG;Px8+Ee1m?A~1%CII*l)uQpnb$FPP!uz*L#
zs%rr?2aIQG0u(6yS$8IYYW}wx56xTG)!iB}Uw8@BQN)bTTl#$MAO(#f1}1Rt`%fn>
z$G_ffe}EyQUUT3I?v08;JE~&p2txZcaD@axE0J#TpmZtWoL5+n<XcRH-eRRr-wPuE
zxpd~F-qjL^3QJ8BG<XqT%9H)<^I8Gm>Pv>ql3@a5%2Bm;DefX<pZi+!ODy#q3N3vc
zTx}oR(0Q;MaYqD`E`eLhXIk*`eT=M)+Nwo>?(sM+kbd`actvhN`*Z-1AozF&9tQ)|
zHYX>7BG#ikuFERr@n$@`&Kv}&V?WZf5j+7w8hQ8s&9wIfdR-r%U7}~Gp<$HryY@<g
ziWhdR6-(BTMuZLp(l5Cgwpyl=^pA7S!M%pBa$8G39>_O9s7s*rQWqHb=|5g6{Hfpb
zgxNs%oNKPnFi7*LNffpB#PN46Z1eb~KON$L*yv8v-ImD$mR<Yb<p$ca3zLU<dpZdy
zOTG<j5}?vz5-$S;cSbY?86TLC*-u74F+HJO)u!~bKT;}9kV}6uO2-3xcIkN#H{pE!
zZ9X<R(g2Fn8OsKbBN9`YbrVd%rc}}z6Cp{I%D#=t9Pa&EF0KgNXv!lefVCq}oz}V#
zWhvEzWypn6`5EIepU+@gRxuAWWWr2(Tux0`qcHY3L><-YAFzB=3l^{MXnf$!>D5S4
zq*6XbeB>|RR=)Qfb(dg+6k8e?8La0LZpiiu&B~ki?g57i>n-U>dWsUduP(dmni?7^
zk2WdaLn~c1csd`cK1I~HldSrg7#cDIkp%ezpIV1`Amd{{lTx8IUpSmJ&t<H=o_R@F
zd!lt1zw9cP2(20z)NtqPxg3wcX?D-m=G=E9(~27qF){k?sEsyYS1Wu-aLo2^9khiR
zU+)X<s86M(KBCuS!j&(u9MS95#pPqcr-TWrD|(@#(;2$t4&{7Q?PSu^;O>nS0-fBc
zgBPcPa-eBlzD{P#0u#GkNX$7f56WVD6ITweXdtMYo1(LmJjy!(Z(ufK+F~n+6fB}l
z%~#S?>IEA)jU0yGlLc`wz%$>I`6M`rCm$Xjf)j*7qf<YB-U+!mX<PY?jEoE*p+Hvt
z3T}KYrVnXQ8T>Hoe`^pxC+#Hd-8TA|?fTZ(J-(3AK4i_2>syQqJ)bVpqymKr+DYPl
zDR4~brvq>3O+kcL&bQIg(iclxYL+<-DqhhvX!A5_WS#JPuxNtA;4SW67rx(n>1JZA
zCYJ+fg4fr9N?a2#m3T)|C|f^!%VW}iR)DMN2)uR!rc$}yxacTxb^TMkRXn1ki1-Sh
z!yt4oU8qOsf-jsFkS8xVTwJ%oTjOY9prMQ#NFvNkkb#$i{SvQ)IZn^^ViXgFzL92?
zk0517J{fOFk5>?<ln7K9u1LCCH{kmq(SEC}&tW<NQW(;Kopufk<Sm9O6)VxS0GS|8
zib!xY0>Ieki(q{8#4S0hAA#c_A|hgH)-1){R5)>be7uF@A#m_AeE&MWH<kRqwU{!3
zL`uQAK#f`GNf{u`<j6Jpio$8D>k3U6TKt9vVmx62FWl3Jjch1&IRa_tp<@4G$T_F`
zbTP7}S(5BbkjcooAZX-o!Gxmc3vpbHzly*64Xu3py>$K>IK@l8YWVFf2bzWFt9v3@
zJ!AsqnS)oMd~vb{l)d_eY?nWgkbkYux=P-O@)APfw{W?<rkCXlY8RrZ+DeZ|c!1F`
z)0$dDs&Tb+SRm*ejU@*)hC8ViYcPY6BOcHB9SZE7DHT2=+z|3fw$it~jbxVWUU9fQ
zv6dh}aCz{<#>Pf@*5}VRczAd~GSszKHvDebrTo<I`9m6GcxcLmct5#$d%vNFdvP&@
zwE#cOna^jFoos&?_=mV+GBdA><EYhZ#4@29cvSHRHyW$rv}5|qmuFJ&_=zMbi!GQ$
z)~=Y3Wq`BIM@M%zJBHRtIVD1Z%B9i5IiNe<T|bOg`XwP{JP-Zj_Zc<9nmGg>P|T0)
zAJ@O|c>g8U0-YUDaTGg-z7Fe;f@Sa*BkGtE*l!{}J)I*TyxAFm8zT`@NCLLTHw*|e
zVhc&fq+@+XhPt_!g9`OU=xBdOc?urW?k3mU*e4z+rGk_EOM#cT+PH{Zd(=<*U*}>D
z>>v;BJ6Viu7k;p<mh^tAEI6&1Mrf2#WDN_a7rElPB?kXN1-Pw&hTCzdEB<yHF4dO9
zxfai#3nz9xFN1J%TZ1(W9Ky4Xdx?Pow)wry-yioAx7x>{Y1^pSSgMN`M}O>o`671L
zh9*F$qPyPzf%kDy5sTiCX^E;@C}zRX@Gyg@J!YBb#Kc5kApS`%uwPA$gR^vALZPQS
z1s<;@_7EO$jjJ22E6~O@fM%}8jr7}JU?9T+#&#J8IOwfco7&Fr(^E)@<p6i;5YW^X
z))X-j4byyl`MrB^=rgX%VEK9H$B&WYh12XpSpXOs^`LyBo%8;0LEO(#HWH}gu)7b>
z@0hwghI~=(s(xZ$;3R<_?Zy}hhP@%h%Se_{c8%{Eh08>4t~T49UxZ_8uiT>&;F%Gu
zxaop@u>*%O?Jl{A3CFWsCpux)m9_W*H|(9b{FKqT5vjWLCYyT-9xS;0XK4A>;D^Ux
zLJkcLtqkXV%**>cW>p@dh$A^XGz3!UftLuZ(>!`rB3xXn4J1;*K|$liz$)<c8if#W
zJUuRA8e1Ov2!TKx?CtmW=_QqM!Iv(9cKTjaRMd;L3DME)oE)!UV_nd*|A&@FcyM;<
zvqp0LPbbu<iMl%D6sz*A^irS7nbH`Dum00DpGS6;!p#JnVi<R4g*8sjb?{8cAvJ#p
zZ6pF+;BQ$W06=zdF!=A^?S)*0r?Y{bI<xOh40mYQY1e?Og~4S>MUq4sL5WRyJZL)U
z#(|y#Kd6nCzn=Kl2^0;m<Pmt(7T8+R=r%3@dTyF@e{v(QRDM1BJBstmddO_HXX=|^
zS2$HIO@jiWR<uCtqW<`@S32&zg7xJ1;Qj%XQ#J1CJE$asU-ha^VYvX~pf|rfj5}X}
z^yY<j8dEbqVVsa)3!#b3{QT}`pX>EwuuA}tRP-_&7yxmwv8SL)?}WA??(XhYE#V+c
zU|gT=8F(;pSW3-59K&wNJ^-D2U*4^c)qkv8rVbb7rUJ8`$mqF+5dR;v%XSvxCn^+(
z9gKhEx(c_MO@QD9E$&iYy!FHBrwC|Mkh*DwQ$}{YN|Y0Xw#<74>gc?sy7bc3EDOLY
z7~kvx&Y*wov8|3;9l~i=SK6PH8S>Za`cv(Vj!n2{wMqfNDp9&?bv2cTjZ-jrf^{41
z*rHJa_gKb>N8I#ocSs|NpYT0>`gCk;tP+a!wGL{kALiS2scvg)dtR1UCTaw45DK|t
zA*mYr2{GN%xL}6`s7E)&%FES)zZ!7^+T_LEADqn_m)tZ-90^nJy{3H2{oRnp`fuTo
z27xa*?mNr0PkkEME~wUGoFxw;86ZRXZ_J<$7m1b*A(%Vr7l6R=-8=l4kD2pSZxm1j
zuXJD3J%Y^4f(qX1Ym}@#S`7cEQvr{}Ti`OA2v4Z}I`eIx)i(${F|SKW81rpETp8a&
zjsm_rgHB?_b}xo*f7^V-dF(TpG$xghKzUM3OiXn2>u?BeO<ntoV|<8FkC#I+P{3*?
zz|n#xadvh*;`UuoP`n4C*cNwsPmz$80a_X{S+VpxZR@qvmv%_DX)rILtsm&1fD0d!
z(+NK7hYM%*UsHOId7JW|i)05F1}J4`W`Wur0<F2KDAS5{kd*3Ol5)Xm>mrHkC9yC#
z=RT>=u7#DA&X*S-oScmZZd3rI4|}{kb+go(<$~;#=4ep<*YWXdXpHmw+*}>rTM5^g
zu7%sqKAcDURh0kMC;c-keo3?68sIgC$iP?^fwBTfYm7L>a1@R=q^7;V+xgmuA&$1%
z?f~N|6!-XlAd+0BCGjb`O$7yn2(T{2%gGWRJ<II}ZnN)?=cXdWz$BW18as<KxOuAE
z+1WjLG6+~zkn7vu-@otAa2{AWLHYnXOy_@w5;T6VU)hn|n{Z&WR2b1y<<T&w?w=y)
z250N^A*T}a6j*|keXPHJ3h0X8)J#}PXeUDnf6>YJV{kDyFnEQ9gIDv~!TkAiQ`w@=
zG4Ay!#a1awtfNfwNbfXgkR6au_8WvHFkJxR>)#$Tfc}641hh3cuw#zKa+!ieO6tLt
z4$KFFAsozjmVz7aetts!#xMB~BU&m5f*uBG$ULRt2AI-|(RL%E#FRo(LF-frDH$|E
zM)&^}U;v^I_!g;xnQ*I(Y=6nZ6ZHaf0hW^OD|UvDPPkprm!@6iypA@U{Yw>|kRl7<
zBfAP6;B+m>$!TwE!`rRU0jE~7{)(P@cz7xD&Buy^a(j9VfU0HAL-*@_Sd*RDSj}D+
zL%T*n*H83U*U!!#075fCIE8eb;>k|2{j_}&JKN~B$*_XYs%tgWe;HIBwIF*9Qo+Ui
zj>phLrVflf6_#&m?DPuL)C?Q5`p{nywFOyM(0;V3_Wb$0F0|`1UDYE^O%m4O$r|}N
zhy~qGF(6KkwuZ*tNB!cXia*s5S7P76%uY~4Gsjtq$HJ1ZP>!-*0O|jc&%*P<hp{p2
zE`ubn((z0&x@-8~_|E|2JIc6LFtwk#P28LvRAI%n6yMg@M+)ntIPnv#!m^+sx8U>@
z&VS|>K(1VQVk>gSecVG&j;8Aq-e2Sc{8Z2E|L4QvZkKVOd(}ujN8g_^aN@xaYinC>
zJ9QFXI=L=J(JB1KU(1<ALqe<{#7f$H&U_T6_1B^UvK7*k>T1u5YCPIxV6QtlX~>z7
zVgWPjT=GQCp83duyHuG3IpfE9yIFlV>wTNk=?`4EyF^U>Odv+f)6uOf;fi^}28+-l
zh-I3kNKI`E4bUXEmT=v_!-DZ&^e#~mpm(RJBT%7-=m!jp)oMf&zF#p_v0cAjK*JTw
zwx8Zg97*QT{hPJUZNCW;vG|2nHnz36lz?sit@r_LQT8<xpMU1vPrnD-xkQ;oqkcEo
z99Yh-1F)waA>5>xQo1tM(sIAV*=8U-e+yE=sKobCNT%jwKV=i<&gXh>hEpoWBV0Y8
z<5%CfC)Z>CH6pJ&n%pN7@Cyc&I@;P7!aRTl+kR8<_*iB?s`ciWm*2gb@TM_3HcLKg
zSNtXuNeJHR$0OuvU6HYej}B#r4tS~P{&aEi%71rpUGftK8NNvi-Smwng+%PYjF=?u
zOcMcUWTUCDa{=n;#rRj6#8n{J1KLYxwx}ocU|zO$`<BieHuUi7YQ)70sL@2rXaQXE
zMCBflm77UV!X@X#)wu^cq3`qpm_CJpa%pzyMq=!S5tg^1%7nqu_{d&cDE$#;0ygY4
zE#8+^PK^r*?WK&risM<BF~7I3AO{xoGfw4}6!n)HWgI&l3SEv}n|Q?1>bBJ+N)EgY
zF=Uqc<HzS;M5ie+&>r1!a>Ct$xJ^uekSx;N_iiq?0Wke%Qv=?$+`3zs+5)2n(m7{t
z`p6v{n8wI_L>%^OX6~Sc?#a{sup#k`Ko6~C0526&@wkir!YWrk$a_Q>sVo@b-G1CU
z2el3RRuhx=y*;b$I_+14rzi6XHOat~F>@le*DQ}$j5`<HTMzI4WF^$nep3QSYhVX+
z&$3E_0zfd)?}i0;=BozHu!Qe29$*?*w)Iw$Mbj?Stk)A~Yl`4K&|%CtR#fP;ANC~K
zeK;ib&(?#spUoIBd0p^7^al!EoKZE8@{KJIDc27qr2u(0oj8)PbL;x8BegV%Z&)&X
z42VJq@3Exh)N5W$@>&lduhoUXo_{I*64E1Mc6CB`t0Wo-UGkl~_=lJ1IkY;i-_!*d
z8f?8aoQLG1<UiQ!uC-s~Uf)bvaqQLl3RR&ba}<i;F1Fd`79;oOk}{uH^A|onaeaju
zds2}FcX=iHrEQ&qdF1-W<RbO@o?nSs&F1Vjo*waJOz4l545euJ){F;QP-(fkKhsfG
zR{MFd^a~&L#1JS>qj2S~;8Oo)8GqXNrpQzbtobB__Bhog-zHTPrkn#h4?b%6I*ud5
z`>a4vjsDf8s9pJAT}rU?UI)?une=Y?3E0Pe?-C$nqK<~}PPTXU_3zOBvbB>52bIFd
zq&;5bhPgsO8$0ee_zr(MAc<j2EdnykO!_v-C;4fvAZqgmclk!qik0Hf`EB-K(9CQD
zgmqCMuHax|pd*DO#h3M@X++^&yymQ4=+f6af*T)<__{H9e<8VODXoXY)RTiY9Z96P
zR0aXkTE#SQ%FX;MRhm0G_oAEs^juq5c;OrE(q@k96bnW`W3(TB88&mT5ScHD-u?v+
zlNuYdfVBvnu>h;Ts6T?)hRH|`J4jkz51%H*KjYqhGY1hP$ZV<0{_C7QSVy7DjNV7|
z!9w7k4hU8nM9lCT9>Jn)Z*D=dF9Qs|3xLmeHmd;MFP)GR6IpQlT|Qfuj~ENzd3X9w
zu!Qt6vwu)HZj51S-r{JD-l`FJ>0!1Hb^y*yoOOO7#vmEb6?U>KZ)<LvFe$mp<+RWn
zm1t9M0%-0!#i$!I)EdSQ!^riCC$BlqbBX&U;pNkEOF9H})G4z}G(UbV1bS}U9EEe!
z)SG5rdvEk3sEc)IE_<!KHsKX7#qGwik>nm%FlYS#=X%_ie<&=q60Hssw$XcGmW2~L
z8VQRgv137R1waXa4}9OhY>@h}3Ywe%r~OFPPhE*6GqVc@JlU;3W5Q&P?JK%(l}%(J
zrq$a_M9zG-7IS}RK>o5oJ{*xdWphH8KdC@M>8{G<2}InI3rG73uL9H91h084c}RV?
z`Y45Rkx4YjI!9<Roh8k0n?@INd>b)`INg{?gxi2uJdAtM`!=TVe)@hCm($moXuIq7
zqHM1)sl3u|c?rgr(>+m<RJwFbod2yM1@G<nv{c3Pg@b{bV^-~l=|!?!V|T=9q|H5n
zO`$YGGTq`Q^JhT&wb%V9(9R9ZPodEGT$j+NDJy1@0rOeCGwY=TleojoTVhxgmJD9w
zjBg&H7h@Vmp~J$1tI#O4k><aWB)rJbNfKIsejSvjltlh#mIOM143{W2hvXkqd0Tmn
zJ=jwoj(io?a+beA2t<z)gsHCSW?Z3($J?jnv3pGBs#k~4QJPcLDAb9ymq+R=02RZh
zsX*}=n{<Z`Ou-UTNr~jnJA>H1)&8v3A+Dm?x?mE|<q;)U)KcElC(dE3<H{NLD2{IW
zmP$-v_7?ChfGU~_X^Qle0K`ktvw}!g+zJqWdKXB<LLp=_e{v~QzL=N~FHt`NEDqX{
zA@5b2!@5dMS9N8hehe>0A^WemFA#@e@r>_HCjLht?jA62Yyo;-RALqt6}RbO?F;ui
z0ZNU1{8sbfOe5vIlCe&%Vp55CbxqNrn@XhO7lGfq&CE>Bn-AWK#E{pqvw~vSSZ{6(
z--Q!0kTj;`q?&BBLqI2w3`vgPQNGaOC4I}7{i;{52s+CQfu6JIR>k$N%?^@IbW`&3
z-E1JFVX#vUXv7zAyO3!=I@36_qrX3OF(CcOO|`=58+A%8^S<gEcds1t+;`2@bm9~z
zl%VRzGUKrm&Fr&%5110v2mJYh>G-?w@}$W^MTKX7m-3+L{0&inQvSrq1dgT$i=cO|
zHgy|Cx^pBb;-ZN2Z<IK#S-BC3zIO(Q|0)$1wYS<kQ10j*>vHn!tqdR)ND>bLi`Rmm
zzajb*&#>fOX|9ey3^H!&i=b^i;8Q|wq}_89-M_-^sOEH02Z=2?L*H;I2{?CBV!ipL
zy~=^Q`iAg`6|j!JpG)v@XTjzEV+4_Sb{3}gD{dBqsnzQPRMf4eRHra5s7Kv>tNV79
z0-@W=faOoUx8be|*Q-We`h^g@L*lyl_16j}$EFH`!*AJBVln()RW`P?;;LN;Ok!}B
z9~}ggE!I}wRi2iGLGC)#XCNJI*I&EDE82*l2-r!)yL|-G!p@@Me!3mNVEx_q7<TFx
zf<MWZX|Wxn-xravS|&+1%LjH3X@du#iySKb66%=LB1F;|LU|{OZ>!j8ZB;dB#!!aO
z8?UY`$5`}L&#lBR*F+krQ?H~R-W1LDg5FM#pmkWw!n<y-72Si5Y6sIh_mwuub%^yK
zaAjhhR$qirtwk-h>4Ym(@6cT;ZvyEM{jP`yhoml%hESP2x0kEj7`J22LB-4~?#^$T
z7#&{%!f?=DTXZ5NAlq>X53vL<^b3}_+5a?Du*>-}MZNk>SfgD5e-YMa>^w_PkHE$K
z9Ci?exGrAdj`VgglL{BNQTuKUs^SQY6yQCO3a84a@L994w*IlSR5WJgb<7j{F}1eE
zb%TS0<6U}&*yT>-TTRi26$8&L#eYbCJKAm*`i!JXJNHM<E++9Tc16KqE1<LUT7Q8Q
z>vfQO`x_<y6KAKrF7Og<CiXKZR8QNQder{u6~FW?g2b4Z;laVHqe%OT`JNPBMGpXK
zgW#@!3&%T<l>ulV^y}thO4A9Jq!lNJtSxxXr7zso$u%U7WsXqP%U;)OLPqVzngoxm
z1MumC(AdR)n1@|EfYbsX-;qJ$aQ>a_)<gW>uP)1gq<MLk00L5y_1x@;s>{{>YKQ(a
z8aaP{ezsln>aYHEcl%%cDYH|vu9ZL2r(2emoTwyzV{Gi!voNwX*&xadykB`_=_l7D
zB^d@3lEhS3bucr|Gla1U30*91P>;j97TRRwyfdVJ5M@L|OB>6h`|Pt8J+a@th+0&4
z>fZg>Enc8<EHQfS7dCr%QTDhSsJOrQos9I>jyP~5h(hA}55}BFyF1-&A2&U_Ml25&
zBy>i~_8XwqiHMl5bG&_fqSV@%yaMDO|Lp!!VoI5=J^ju)GXj_FMM!;rQ6p1y8_o0l
zUQPh>TmbF;??e_59%Rn+;Qk~?AE{c&r}p+%2l<cFjew8Ib~{V<1Ckvwp4gQ()|3~O
z-%LxWPrm-xwH`0Ui^50getxD3en0}3i;Ihcg9A8_%fe!*w)UJ`Vq&7UiVCxe*3aO%
zf&Ts~y@CPJ{{A|3kR7y6<p9w1Kr;(wz+M;?0qbgtxHI{W2uzUEBKAk$?hg(=vfFW!
zKoMDW>R)Vd#78L>WFeQ=Z#vAR13UHBP*cYrJN1~|fDFs?ODTkuCoT;x!OS!u0@`)0
zjQeFb2&ZNbln2QUV80>T_{Uj|m|gmvNG(k84v6y>-$0=$3%Vq0WZyv%(%8~d7cxCv
z0FnXcACCbIngkyo0IN@7IBd(wKz@#2t4{onVg|&a{K}V+^w=nxdqq@J1isO{$k(sH
z=t0A-0d4bwQ63px_X~HKrdOt10``A3{1vULKXqqVo)X|S1#)LVR)~qwlzq&}5fBuN
zjEn>@Q<5U<Dlhn9V|%5u&K+g&S<6rgpC+KckpFeRX%`LkZfk4n)R`Rpe0F3gg9Jyo
zFW|-gI|v&KWJvvg48j(jUDlNF_{gLMtOaGAI&t856A>6q?y1rXurH%Y3<DcVv^+gE
zHi+fJ-+lh>8$;UN!rOdQrog=G_~M0(4ITU$$SS0BQG&_4|FyNs{il-rqR<661x1NC
zaQ9ob6%u*>ib(_l<?%A4=W%A={6jhxkm3QpgcOrs^Oj~qy_s@mYB*qg05V)?{aFi$
z5;bT#>wO5G(dfv@zY{W7;HG@w>I(7LE*eG{8eAr0WV{-37MU&A_5U(5`@G(;LGhbH
zZ39~TMw1!;G>jqn`@=sd{FcFZdZjb2dECG~4O#>~*Pqhx2C=d;$2pd3X%&KOyIG}0
zBku#TJl=~ibeztAf&;9E0W7{6cv44WQoGx30XhEjX6!&N_d6%3Z4uhr`aeY5*~b48
zZSNBP6m2^Ydb+;^IcXw}t6^_CjN%_`m4M<Bt!>1jq!%=9Cak1S4qvvFy9DbP7M*V}
z!-TaoT7cQ)mh;vQbjb<|9v-J4#HuAJ{x#^$`AkefPQB~bALTe*1c5;RowEI7v~1Sc
zGqcEzlNIEHPgNM9k%qW(YD>AeT1x&Xy}q&5PG{stMi?rka3k;ZXx<<th>;L>91?!k
zgqKxjfynIO_PRCxK-%G;wZG5@$oTU;oW4H6G{!&h14?-jzz<Wmlq4(2<W<Q;77Jji
zw;phD;`}u}bLha}5@%{&wLd}BiFD@Kj;o`ry+Z4iro!@uTx^I7h8Vs-CFSz8`w|!L
z=K$Co<V<p<hVzHqdF2ZZT{&4blbX$v28J+IHMN9%eM9*vXqBFgiOE%9R*azphwtX+
z>y<s%)zk!M>(H_~iP|5~Pk;<-^U2b=86k6RU>P9GbqoG)dGrN4mLJg38Axyz#RFiB
z0?j5S*k{`(_EYL4gKqH^BdCdRs-9;@WEeK20Pj7>-{w28N3*VyTLiqTwjj}lJf@sL
zA_gH{fT_DD&v1`{q2>_i5<%c~)#WQ_n&z+O`z+xx=6GQMxRQtcZCLq;O8Kq2`LMn4
z>)o0+A&b$e->Y5AuCpQ>Hoq^$>gW4j5BZ6tpx-7oE-o&1_Kk`v{`Mpg9!W_>brbxY
z@qBT2KxCd?vj}}a|Alp+rkq}f&eTD!R~(0qep`o`<sJ7?aO>)J!*O$oGE;!Uzk!n?
z<|%KzY!tv18ui+QzL57uv{I?ivTwsZ!{fIm)n!f*t<6z$r*x|h8;QNz*B^I=SHb*3
z-0}Mj_C}x4FU<@CRy3FBpFHmc)rY-Ef7i`S%Vzlsswwh+3X-b@2(n7rN#w1>zJB}q
zrB!kbS_PY;PTh*l>DASzBBW^JMBBdVOz-|aT`bej0~#W+bjYZMg{{B7I6dCm+uj}=
z92B~7W4ivPxixs`R#v$dCNHMpBkF&$D4{?vukGTT+pl1p{dRr*6kc8FX}v7AeGXRe
zL*yio0-}gpM8R~=nIw)u6>oK8_x_EzSEb`x%EHB|j;c%A_rGM)XsOb?5wj5Ly7f%;
z3ze4jw=35*;zoyaFzzS&4yv1%MX24m@WnCUp!nYUo*7+>KJw~<9n_4qS@>l4Gz<Tm
z&RRuFP1hXVse5<Dv&wI<vm=&hciz0Dp}khKNgSQ;<|oH7p};x6TQBy)>(Wi__Utn8
z;ltub+*y82F*8@=kZ%O7D$IDtQVEXNJNiEuj-4E{oqt_^qUd?wxTeHqXZC%Wf|(Y7
z8BWfYU)jK#-Jqe0lzl`y^qt&F_RvaOitu>6j{diIuidiZm=JByYxXP|oQSj)x-GbM
zYR62kozTP~#HG6E^A5ilqf_^zx9>OUPt560xaPoL&G@+CoWln@RqJCV30X$%{6`=3
zoaZjBF)zNbFfq*xh<C0Pb=MK=f-cVX#K+szCz3FT-}T$=gbJGqw`&5c-IE#_89LK>
zl^d+(g38-t+IW(}L6Lx3vq&8%-SX_Bw^YD()gMvObs*9WWp#9a`p8jcoWnZOH%6<{
zLe_Q9rQ*?M--n==A<KD5EA!xMpIf?TLLUsWeDcNa*>gk2zR%B-Do;T<ZRPCjr;)^z
z@wm($FU-3oi%nen4$+KkcJsp)%ggJ~ceRe~m~&ssC~IyeENU3HK*Vi3>AP(NM03XW
zKYIVT+D1QCQQdVZBKbwY(Gj#K9=V?4RiBpr(#0j&ZN2W4@3I002C~n_?106Esrn7_
zBE@&eLH?cR?}hRU6flyKAmd_U!UL*%_hzM8Nm{_awU>UjW+J-8ElX3nw&s}e2(~;Z
z9}@C1DM@>w6MC0+c>IX))D`bJY?!kEW!n^Zy+5vR!Mf{ULjEqncgQ(qe>pk1ci4TM
z0f{NV$RLn4a1SFjs;9X#e=2gHFn9k^2XxUpe_n^kEEVY<>mFfM@a>ZkBAi~xaLMAd
zskg{50V$F<X?7D1bKa*ZGgmxKhK4d8zT*5`36~jvpVlF`MWvYeV7Za!v&>tF^Tw?`
zCih^mKJY-2q}(y56@9@xNr>8_|Lh{+5pS~mz0Kj<H=M;sEhsME{O4^Rl7~P&UV&?B
zYio|2gyj#`)+)6cZU-sBvXXYIjlF66KddbsAdOpm7sK8|t6ldfH2MV6{~EJFt@P1M
zudnb^TYbc<Z);b^6NU36>`fSxtYlMG?^+gVB_-aJ>c~56?i3b$Z#Q#)XAk-OS-;#3
zgU%mzSw(e3Wx;gE;h)TKNF>8-SsmUW&)p&ma)+s%+~r0D^jPdn@@of3G-sEwab#=|
zG)oB{jB8^b^+$i<(kXl7>&9yNs2e)N<US9PtmN3eT->s>-RP0+j1r85)hL#o3+rmX
z@Ey{Gd7xOs8BWXYp~X@?siFD7U!U(Jz9+f5_O<jI(N^fWRR3tL8jL2Q1I1yJck)EC
zBnOK8(lwFD``_hmTVl4|uE97b8!TChs((#@rwDJq{vGm$3(QAvh8yqC{r5-TQ0Ua8
zqt+&M;FN5H`H17whVC~=6Ph0?80b&i-TL#zAL`%+=gw*Ry5U5;26xh%?i<C7U?*;c
zZt4b1+%*qj`oDU+?x?1=X3w>71q1~I=^_bLl-@*YK&hccx*!mGhtQjXh=PCtLkmTk
zA|SoDC`Ed)P$ME$Ktdp)MId|!{JwkddvCq<{(G#IKR7EnduH~`Z+?59J#(b?E}y61
z?iW>}NdgM~%%bQwpPw2S7UKK+8)j)aXDOaDPWA>2Wwprqe`MJf^%KzV38><eS6D?N
zMf7sW+7{Tydx_nVst^B$4ExtoH-iXYyW;E&E*Qm}iUweTn?GqVAbUDg)ZZ~=_gA^X
z7HY3hQ8CO}mSFt&Ti3{U(k`W<zqtZxG`8HcF~Idl(|}d&r{f7o!WrNvKOX_Dg6`lf
zG2wm(-w5Qx80)~<?=e@%;t;)O3L!h~!x7}>N->C^-(vCIet+Cl!s`OZON1=j$WZ<n
zR8Ra1H`kgVVs)i_b2Iiz@UrWsc9!Cy_mUxSgchjk*j@hox!#Qw?D%8s_n^np0zs2$
zJiIvT+O(7b;0s~CKHIdpGR*J5FugzES5pF)($@Pa-D}3>N2!=F|FGqW4Z@gu%<wLp
zq?JN<3?@Epkl-F6l+R!W6Fg+J*j@zlmFYF?Hj(pF7Wvz5a0c2>Y4815iHT>@mENJ*
z*_r#L`b*q1g8wo8n?-r!65qf*ac2s1bGDvDpx4&hot^kiZTlNDE32#Wh~>Rkx~}8p
z)`ydh5%T_SMYVn;5;3Malvn#t0w=|TObz$&vFPf2z3GUT`;HcNuPwvmY|{{W$oP<>
z>S!!03eU35Tpr@GqQx0NF=^Mf@<d1-s-Adt6seRdY+-749W#H8)M0Ui-QPlfn^9y^
z@`(hz2MF{#oyi_dE1hjeWOjr7?cn)Aw8YUqP!iKloA<OYKqKaWI9$F_&DF$T{H3QS
z^_t0zLUxZ`cvzUO8yrIY&38CO%GKDb{1HbezLirxpbiMvq3zM@<~;$iRD`Th6CN2u
zm$x4$t1kl;(Qk4NOXdA93cAbnP+}^N?;f}4AL6Xtfy;Gg!CV?=L$0PpcofaGB-MZW
z)$v(xm%LNox(VXfVEt_YySDk7;9ZZ1{rS(S%Zb259%3RgC?yR@L;2T3*v(yL0x@Xv
z1m;JU^>#K-H~WVHOF{F3AA`7<3+IuC3!;TuJLp-3DL&dG)JRoCT}xkLxcTHzJk~tq
z&!cPOCiSw%_SjmI582t;-66+%l)(DP<mnm<b`&1rw@ph&cX)`8C{Tb`>F#*(=N394
zv$Oe?4!3+ivA>S+o9p~(0Kw~J-`d^i7*@yebM6837hav!``?VgM&v9RZ4BGDDqD2&
z7&|2I?mH_pqVP2}x28UQdRGgtp#^m(#<MmU*7Tq~cQ|Cd($*Xu(Df5yc{Qb71jq0#
z2>C6rl`#MPzGqpvhxakoJZE$s2yo(o{5b@`K0acbDRpqa8-p(?O(7DqI`;Lhgw9^T
z9Bc&LgvC%wDtLO#Of|0e(?hyjmE5tUck5)j;50*C5O(3wR>cH8L;;6kxhc&dMJz$#
zESsIrre0AAJ~oWxlDs5xmU5`3ewRaDT1|p8yp3^`{q-NTIErLNk_qNlSd}kgi!+#I
zlJCy?#Ac86Q81E&87=S;e<$A|+&=i>S>-k(o#*(4W`;)Or!@SC^aHxgTwLtnQl7Qo
z20Et5lgmk&xP(>!vLxxW%8Jcb6Xp|vY6jzi4~g-J=o;k{WmozFo(1=V21E8}W=4Vx
zB^z={Lpj&Wxq`MM@@zAV>x{-Ni9^eZ>oo_VF=z*=nP;NrG^xyD^+eEue0mgsK)~RD
zxo=84d~a(hn6_j4>eafeow_B`w&zPm2u)3Lv4_;Z(bVR*cr&VXV`f&bI-@#A$8~nM
zrt$tc67-)z4T!Q8Z0Ab^9YlLB?@|Dw=+ZeCS%qzDz;qzwd#)a1$fX#0py+D$mD!*S
zNO%WJX@2qK7JpFmJVh+ZBZiX|F8H?|IIW3E&KMNKZ|sBPJ@>jtgdLnVLIGw}l?-i<
zWm}T4KKA^a>4PSlfffL?4(C>hj%^KFy-bEQgscG|_ecRQ(34$M2yV1%C|APyUzU5k
zHcMGCvz|?xtVm1@-&)<+u)jGeR7ayQv-=XE-tUf;{uVHUv1$!YcKaL)BpkYbz$?H?
z*H>18k}P&|BF+y6IDy_DoqEUx!LoP^iDo@wk{WPq3+;CCOG*YjbqXtU!^$#ZX@CSc
zyqXIj%W;aN@Zo7CLEZd2h6A$Kf;L1TsUjDw<FwqDH~TFvht12Ea7}LJAb7LR5IG&B
z=Yn=|)jx(qRqASWGxjkg&tZiFN=}&<KT9k!0m*cOiJa@r!$o#ZyMfrcbi86BCwnY#
zD@ws;?|ID8UU%Y0DO!f%fHbUL0WbUAyG%+4vjLbx+`R>u=_WTo5&q(9KyX)1(kjgW
zy7@RQi*$6P<{PpHhr0eG;6s0~%lqF_#XQjvow0EMW!KMy?+&CB;piIp%)!zj&{=@v
zFNr897Sx&uoUcclRjT9A&Rt)vOk<!Gb7M1NtJf22+e%^mD>jvdvYLzX2VJO@*jxC7
zR*qT*i2`7*N;UrrYri};3%tQ{dcLS5FMI)<jepK16(Og6s}fA}52*n^DbO5olHLF)
z`?aMJ<A)Xzcgkhz>J)GPI6Z6@n+$8~@Qh!My;V8L4h(UUD}ZjEk|WMQbT1`H@kO<k
z|12tNe0-yuvAIoLusPQqRi_Ejemh;0*Ba7lDM(uW($#c(du+(hcS)YTzE<&bGp|s?
zRet`Eb&G%tli{A<?F>I=ie3DV{E*XGV`Gcut#$p2gto0u7It=zCuVUqo<P;?hs)NX
zwu^m*HJ4Ukc<gxL=TWPsf-9_~Don&~CW-DyP=EtCv*9@8IZ4=l{P=9dug@_+L)LDu
zJsrqDzyT<}q(%6zyZv0Aa1%{yYWk(|KZx~I;{QUdbzoi)D0HXPBH+p-7fWCwOw#fE
znU%;B#`2pu+uFSVK=Hy>E51rE30I3YUX!niN+_LGT8u4A^9|aY`pKVtI0dn`afr&D
zk5T-tg<A)}%GvfZG(6N#1as-f*WS24oBmvp_~JJxm8gWFm#7w}2B=^`CLUOEk@~?#
zz<GzC>x#-(xci4~^Tx6Yt}S`>gU-{@;ij>}C^nWoDmh{QS@_iMl1Q;K5(H2m+FxKx
zl$K`GR<pnRkBAs*=a|2_SK%02mo(ECfFteyqFnR8Jo&B08HWVzJC(eP=wrGk7Gtp;
zpYHCUG2e+=#m`KCDTu!(o1P%QsO6oP@$N@X1jmVZbQH?af)sq`Whon>_jb>M<MIsI
zF!wVhk}(9?u(V{<*<%j47|Neebaj=A3$i1dmA%(7F`tvZMa+M^qYSJG{$etu&ulB(
zANsV^4<fyXGBxZdogbc9R~(5~_J#mZl)h{L`S_3d9AQ(b0(d&b!Wjam1~U)?Dv4}{
z5B}CLS1J;Ibp}90s{flPpFU>N>A*2z3PB*9aQr)+JjW~lJpwBsO9u-y*9JP=_#O(q
z+&~}@9CAKjvKTL+vStKGWyg!)SuCXnE&1$eiXL%bP~bxmU81I#!hG}h$_^Zgr4ZCu
zD50b{{%g?w>!%IKlPeV5F+hnl_wkDQ|LaPD#0*tT$Ek<hG$I)-4vF3KXmytt8em}O
zi#nRexAsTH2lno6)5q5@Kj~7fgbC1B92JSR$}pJBfBAB$zsm5<i<Uc$mf6v}X|-%}
z50xJrm>C<Vti=A-hMJR*75O~44MGjo)7w7`ugBuHgM0nq#*@NN)$(9My*Y4nC_=B_
zm6kF}F=u$uM=lHMGet0i9|1QH<7HxE0-h>}rwiZ9!4h*nywRIpDE_7CUeG6qk>oL`
z@OfHf*%|xE2ZUtn#d;6sH8QEV6yT99ECjCJqrR=SZsBnaDA&0&J*3{dyqGe$1^310
z*8N_x!od4(41qV!@RE|<UjGbWI+69I6X1&H1efF&@~gURsiIXE7-dNuP(}7QHJ+Pm
zk!#?boWt;=g#|FF0ZKRGBq_<P$r^+#xLH1~?P4Hs$*1wwPI81O3s|bj@`j<j+yhv!
zOouYz^@peVCafto(Wh?r-0tHsqQ{KmvwZgYI&4u5s~&_W=NP@A&9BaUHblXU`A`)M
z((lwFks1Qx7NGuE(^t=UrGgK*4D-{zm>QdzN->Z}YCzxNaJR-%OP&`ms%>Wn3Nz(g
z)!scy8$p1-)BJkXVnpxRpk*mY|7{d6bfT8>3b4*oGVBcyX6}D$QD0;JRyFtHQ#B&z
z?I8~8Xb!sW&Q3KuyL`hpIyl~><Yx&9THvInrY8A>ikJDZI(*4koK+LTIWyHnK#FXn
z-zQ%Bz`pR*E3Ny|=~>3zJV8<V+&Z}*yuhK;uFp#@`PKS8194<itGCnh?F87=-F~Rk
zV@#z}uLDI_8<a6tMH_%-)BMdl{O{#MKWKF%Tx3jRwcXJ}C9O`R>uVO>K%a-Qm9fLx
zwNMR0wX45fNQ;_~b%3;NW2BA-#Oa<lYYFs@^mSZrK?Ch~TqHAAS66{PgnNhK`5y(6
zdx8HLgjHR@ITM2p)mYq^OKmS5XJ0+r4~}!))RR>iKKlh7HkIQ~in~sE8%jZAPN%%Q
zYj*EVO7SV<qk)Zdc^h@N^jBYJePsve;x)34q#lE8ur<>rimgRu5B(&rdTu1BKcc43
zzhAER_2Rx}n9Egkd{@E0ef{!pLG46Y)OZYZb>sEZz6QqL7Y!sX={{96w6H)2zR?#z
z;HGMei-k~Hs5LMrp)C{odeFhu%}pm%N0)0%7O!^ak87nDb*X_;uKrU7x^ww~?Z^$a
zcd%Av&;{+NGVbI(kp~Z$1l5xx1^XWbshH``*|}6%gKTyz`e2Kr;(imXv1i(;Qf&Lr
zv<o5qJ}uSjjmAnnHTm;IMEytr)-Fi1oabX1V>wUhvyrDBXJ~OD`UNK^=e-6m8{lX(
zPxl=jBtTeI3k`t>b9`3$UAC-Ou2@&x|3grH=)N@CT^jqTP@9oernCIx&u!rCJ>;1r
z34u%0RZ5qrH+M~7FfaVFhs(3z#04MVh&5&gA2ck)q_rd;r7!5CW|`vMZyN;RF%fK6
zD+{8{h|Ul)W<^jkv=oQ(*rz9Zqdha21=&pb(%GU>D*Fg#7qG(|YcB#)djn1@+6z^N
zvm%?;9#5MtF*^5+3bx!oQR)H1!%+T+iuw8Zr#fm?lWxW(wT+D=kCcnq+1XrxURCg?
zFrn7@_YT!TKu?;=O0oA|{2X*>9X=p%ztII5t(qY<7Jx1Mf~Jg&i~<))mh%gv#xU4M
z`2!t3A}Le|6^mWj3<PEr0y$mJzsR<dtuL^ExY=9y%GHD>T6N!X=^b2oRiqq#P0ekf
zf;)xYxs$QXLRALcQ!KXQe8;p`7&fMBaiAkxc;|KHvxeK^>zNlD(&4UK8m-?}dR=bw
zQiCPP*Lv9${wPP)$~dFj$CSZoeIao5QhlRWGFApLC70|8urh`dWtFLpquI38s(jR?
zYc4PjeI`mJ>b<^+2|rSUL6OM8gM5O;`QOaCnFW-VGM+o55g!n06AIX-02UV3sxMG8
z=|&fm-`JR*esB8m0h8R_gR@b2E?_JG_H2Vn4$EH~yrWti2+VuVbZ?w|e3CGL!s(Mw
zV!x=8!Q<(c0>3oG`zO8M81ga1uF1G>N1o+T(dh)idp+i9?c$Xc?of0d9kEcEw@bU{
z-?&Bp^4fPDaYcf4q>1SVjoyfXwN<6aCh>W{{M#?Jia%)fzL2|r-Lb1yHU3>rr$9xg
zYi2sllFX=#tC=rYJZrh8RL<oRO*@;zTwmbZ21=UE6AMrbP?4bIG*%3VIc{_{kf?82
zU1N?5&3%T&(NRVnJiJZ>k{M-GZe5*pI1FBo4O6rR>SH23qgrSP)WE~&Xw}>q{P@pp
zIJ(T2<Ea`SA76o?GmvEFqzFXLCN=s=2$3@j+AYF_#upfarsokinW&>D-K?tMRIO?<
z5pQ!g&}Z0?I#+$I>c`_&ZI47M7#>2_7$(xFkv!1&^_TpXT}7a~Uu`=m9d0E}9Ps))
ze_-P`p!zFcSqSl>(bNJw$;Mx3#D`Jup&o4)a(%}PCyPzS+ndUPO5$O|LB(fWx6X!f
zrF+__-y6y3vlLadv1e^0P$#MhH4(ie{_XLPjy;|$(b3j6lFl!Ec(lZ%q@;wDjLJ*X
zd1X=WfwXgRb)6Tv4(KP$Ow(U=kSkVMX4$Ap_rwSk1MSbN&U9+9lVlrpky3K=X`U|l
z#H*bGwgRQ%M=BhGgKu6uCxFuO5h6e5sWQ5e+70#6&b@7mK)`bKWjPX3Qkd7pza7*n
zwyO!Npmj>p2qdt7rKLceV(J50fhdRMLaP>TX#D)=dY4s~{>)SBn|H@q^D_(tSy+Af
zxjwksal}7MDG@VetxFs5v)^PMNO=2EAoC)XdfWFGYWg=E?Tu8F0z`w*Rt{B32E`zW
zpRA;`H_m+A_I+K?WH;G7b<x)@!!)PozBo8+R;rGf%m$=q*3QZ6)t)}FjerWUC)s`U
Y2VIcY?C9oR$nix@b*NgI>ix)n0dH{C0ssI2

literal 0
HcmV?d00001

diff --git a/example/ck_tile/15_fused_moe/misc/moe-3.png b/example/ck_tile/15_fused_moe/misc/moe-3.png
new file mode 100644
index 0000000000000000000000000000000000000000..77c6d9b6e43ea2c2ef9087eadff6028b6af3f113
GIT binary patch
literal 18655
zcmeIZc{tSV|2I6PlFF7O$r7O`E?br?St84jJ%l9tnz3XVOZEztgt0YbWCjyr>{Akj
zVHo?GWb7u)Scc*LP}lYQey{KSJMQB?j^{Y;=eeKjFEiuwd7tO`KHukid%e#1Tl!jz
z$Ic!Dfk2GfH#LkvAi9Sj(81?N4+6i$|B*Qj0^LMtYp9w$99kSbo~iG*^I<tw^Xc%N
zv=K?Q`%xx0PiZ^XR-LlG8Suv41!Lqvv$DEWXlBO1<&pa)*Q`%4{)QUI$y`ItrS{K$
zG1|L1n-y67&ee9Q&h)^J%<16l&aK|Ic}1rV`OZL|uLY3JEJ^J4$`fW_@VNJqQ9{7q
z%<Im;Tt2Ywe>tc6&zJu#9Z0mwxK6E!HNBtAFdw$e#vcNK?y<m@_@xhk!rfa|!$Kin
z&8_^M=E6K{$=eauVtAEq_-4O>r++Z7eTS_$^pn<}!mE9fxWoK*rqLZ+imP~5snrM`
zZl1Y8LjyLL=lMNRcX^BM?0)0ob>QuwWEePW(y6flhEw5eO{IwINr`usSWVWfT{+ux
z(`;87X*$|2EIq2WC-`~8NEvNtdQ34hKo8uKW4RP<g<gQ#t-jVh`CIbMdd-GX_enFE
zuGg9?wajC_K^0MruP*$4KXC8;6=C7oSwVx*-Z%O>anTH53=0edx}J?mEva6wVD8n(
z@yFTn;HAioQ3$J}U=WC}^DXnMr?86K;P-AuUVZQQkG;VA9)~l4KtCklF5Jy;|M>Gk
zX85Ip`@f%;zQMP*PQ@t1(LF<a7K=X>v8T)x78w8ja6D?phxQizMcw)FzQqO}1+(st
z_K%Q%bn=g#{D0%@WKTO@3Ehblbp8G@o11r`>->K?jem~uKgan0(qnw@=Jy|z2r0sg
z*08>E_;@?KEudk?`W0-3z-M=_MWK(kJ~>;;PMt_CsuS207HLlL$U3Pq*9c2sYptbC
zgb%3ghrGD+6H!D&W*xR<OiFn2b?7Vy3@K?EiEMiBsX9e07@a4;uDkH>2hyg;%UqJN
z#RK_5o=T$HpYzjmeCICp%jn?=N<>fViuJb@0yMSNMvGalvXvf<7Rwa<CrMVRDJWZ;
z=avqZ(>0!98hkOoBJaIu2HbNGHzpljrkbbnKMECmmODN*gyVaTzt@T1-9S<s9(~I3
z7&BOB_u1hNhx`i4rHxUJYaZa^R*1Ei3B|_Ww`6qM71qja2F}79kN2f3C^3LvzhvEy
z?Dsy~qx_SsG2`aX;&~m;DZU8f&sV!cc(bDFHN`OWGUD{2+=$zlS<ciM;qY$NUy5L-
zJMEQIDk$t7>N@2^@&{AgaDS*1?v!nQslN2d5>Z8CjX3jLXY4}KINWt{^l;9DnJV$y
z;21U-pn~512mVE-vypA{)wIpO^lMFQ3i!tU>PC!E3PaVC{RD)CaI7{(Si{+~^=qv|
zmx1Fm0TDLu!~0napV<sTulSCqM~-Gt$F*5`x>Wb|!?h^|@9-umgTbLO$`kqy2gUx}
zkJD0DGp^U=+N3Ph<rH{!^g&{zQEMjiXZxZfV-OXRfN6{enZ58!(XS9)_U3!t^IQ2h
z@C91bWAcA+N=qNF^|C%L!K4@-C-7^JpWV2|lw?!u1vMKR8ws)f=t5p{GND|1Rmcjv
z{_@u||L22(s@|P<l3PCwZ9k6Nd6JHY^ZhcApP0buhP9+b0-;Uwgr^CrW;pneM%IBR
zn9hM0Y;s}uhhdT3FNz^or+!$=JIw>jlgWZw5k4%2G)&)YhEKXGRQObfCY+3|&|`3-
z|1DCMG!LfP!b%DXMgpLiR|%tI0(^5(oUlpR`^vi%W$SdnY8NB|=D%mf(kk%<s*O#F
zevd~ESim>YJCj$%$DD1Ng-sO8KUsKqE#5-=(P8`xeSy^To7YGkyFp$3bpvKByzQH(
z$G+S<a-vXmGaJDE)}>L7S>uKN`OOSd(8KYYD>XFgt$aI1aK(pr?6o}_I?+|SuI37v
zVc!Ydu}MF2Zb8g!2)Dd0M71Dr8Sn1q!#0K!tVwf0sx#4fSELg>lYFrcy`3;?$WwcR
zqY1m<4gX<JC8+=C*(W<!@rz0aa}$|x@87KCP~6Y>9)Nr?{`{6|%26OAjXTJu&{JJv
z_Xsvq>ph(LG_v9D^f;Yg=)Lc<oWj>Aw(B<Qv5?MV_1-U|9P<*vp6Qae>qlx_+cGDl
zFle0Irczx%PHd9fNf_woHImpaaVfsMM`KXZLI2b?W+~->QZUACX)W|X*Ns@mIKl|^
zdz<dO3D0QBE1QA!E&Pd0m+tZ}_>eW4f$jEsSg%IP#tK)<cJ-Z+OYrrpLu$rf+Xr^P
z-v~2i^Pt-H&}Nl`h(pZSJ6G+VGvR*(^;)fa1ne++hzgln%J>b^C)F7j+?e||7{*jH
zm0`<k(x4L^-{0_txa~5#dxd-VYwIQ5-uFQ*sGJ5d`3lQ{;Ino>)v!%0pvf=tatDwH
zzN(9Jca~r`hIxB5whO0TC7cNfsyKvMXNu!b7#n&oD~mPN*`C;DtKk4n-nwF*E8Z&b
zQH&!W<SRnHnX{*D*Ft$aD>{gpt@1wjdrhu^l9rt_EM4T{E_uAAw~cvMQoSFZlGs*^
zs+@@n5t$Uk9?tgM`O=jsKsQ!v0G@aF&L)DCcRYZVz2dMpG1tkpdQ3vr^9DC8Ouxv)
zZmL{;NKx_XmUl_e;5)~Puj*9kLbg8KyX%*4X4WasY@f#=m3nJp#@oHs0JoNISv@kO
zobX6&wWF+JayqP}GffmK?kP`QLwy<?o;iU#+=;VMT>CNgO@tS$Q#l*f<-mLgF7yg~
z=S^79=zx{JmxTfE&WRJa79-N?@T2a@^m)s5H*``Y^;qJA8oyh~c#UOfwZj&IdrNZ7
zp|)ON%p9S|i3#{P`O0IvRNtqUvkfZ&5n^hn)XG2rg{aEikzhwhpz6ao@i+&VDPh+8
zCpS16PW2SO!dLlIR?nV-LIUi3onaXfCR+P}?(G@#HNp~w<~emqQhHpwld~pd`uR4S
zE2V4E)q>f{kw?*okTy%7_+zPE;o(o`?DQ~G)O+o9G84orPfB|Cn&(8FlQNn8ey(yR
zXU?r<@o`1_(+`)w3rk^^+(IeU{n!u`lTON0cBILf2OW8?)Gir;9<-|^+Chyla`!&5
z>LKSvc>MQU@tZ%MMz>w{^b6Js|AI2v=>Zgfr+cA)_oBFEYUtKL0P{3ABp4h1#clk}
z*d$aT^@411dpzlb=w)@l<XcTNah8h6yJrLh3XQg{M~BA3C^sfZR=zO^N}OXHWIl*k
zg|ZoFxT-E_-_fxU5VPH8?)^pzZkDT~U4Q+q*-047EWyvplj~X}Hef<qb$4uPx9FSQ
z+-{K&<Lqdm#itiuB2C4L=haqAO>2cc+C_>ap&!+xN{zPf8bl#HwUmJ4)k6^B>~E)j
z-xB0Xa^&q2cz}G_gJENZ{e1L6D*l=zbER8jx`xxTaLK7RW(FsqLWZ`YWgcOPS;e_>
z8ysUUZP?+nWXtD2F?V4r&mqR`qbjMm9XrZcGP5UR-LKFk??3t#m3thex7b=BJbmu#
zaC+dkyCnhoStLs=bYZG<b0<J)iX3dSOB}aN=Zchu<|QrNgl5+d3yh}Ltvpy=n&!qS
z7rinUr?fwz4-k?*Z?jaoeL9SKZCw)-PQNiqt6cjGXV<^NL97wLj%~G@2SX3r7u$w8
zs*ScznUUm?1Y_J_NZ#(J_mB1hqv<__AkE*xsk^Y?P=bEv$cgu}$rP!jj9h+6W%zfR
zE)Ocr#APwoc8={k(eZUsPEMYeT7UXm(t@ECK9X~ar`5Er-{pA?)FUQnaHAU$CDg>F
z6g(TZetPrF;$e{DM}_SEyng?)K`9BzU8*@e76TDqU&Ad1pz{rnp1(N8ML#AS_)q|w
zj+7*&DZg<f*c!Z<k(v!{OEbJ$ESr)s98T69eI6xr|62AgWOYl+iAO<Rnq$(OE?hQE
z_|ygSk`3H=>}mOx>Ruqp5g!S?$?-;~z3<Ypx|IpZq}J%$i|34Bhac^q#X}yhG=eSY
zDl=03EZ<}gP86-tDVXX}7v~jT_-#DFm>0n+6(RG=^?F+oS(d&k2D?2cetI%Jf&I~I
z&%+9P=CLA-iUn^bB?+$_2*;#3l_W0g#tw^lOsy?N>Gla+v_B1x_g32J+g@H(F=zYa
zH6vMNWrYc#2eEKY5{n;xUrdf%I58LWF<I8%FF2r)GVp$r?R4h0)vTASe?4mcFq}hZ
zuOY;Lho*_Gzl(E}k(g?judQ&u^L_CED7Uc=ElUY=se#S-=;sxxdwByQWac6T^Bk|3
zu`!cpe%|U#{7ZrdZ)jlKlAcVdd($_~3S@2lHBk7MfjWromR8H=u^6_^h4k9du`KWX
z?C@y63r49>*dH8JbeTDh4#eE``PJ=9q1RKddXjVpw)C8J{-qT(@Ot<4K~P<*Cree_
zUh)>j2)N#dT=4iW{L*%jJCQJX`-tId2SIV~ekfaqL<#XH{C<QT&LGSJd$-qbLi0;c
z0BxqLd!g4fN(ktUHXZ@GPIZs*QEW%S1HfqE=BzMpplQVg^qzi-sIlquN!!r@JuI6W
z&W#LFLhe8t>jUa2Snkj9{}KL=TK;Y=@o_g72l%$Gul;b^4fMdcjf`;&NKq*gyX)Py
zxVDZ(v8>INT2Im`{>Xi^EByB~*ch9`bi0nWs-z{!#BEL}OlhOIy_73*c<dMP+?JF@
znT%kx2n>oiu8)!{wa?-oe~CZk+Fgs@4enZqZc{NNQ0DtcBb9`DS!Vf3vQ*>vqt+*%
zQLlX6N9|^c*I#)%jb*IwcrcHr-<FZEUp}29c!wz@#RXdXA)Q~N*L?Ai8sXEvd7pCG
zj%lC9junU!I9T?ID(kw{cjF?^vlr4{o&)0h{J6vB6?~~#KC>8_g9>E;Kahk-29J(T
zqbRnUILt7;X(!kWyR+DUeN`8#ur^<`*5ckO1?P;mt;h=+C0J3U;j@DKNAOtQ$<-;p
zEec%c(^;5ngq^5=Vf@Qu4^n$deC2kV3a&BnepP`Ji|P<|lTj<Kxj@P3st+XQNm&7w
zIw2!76<fY(`ZKqtYhoDAk7eOcY4wMr`R;fw`5F7Dtt6??Ha=r)#kTHkK1ovM-SOu)
zRmp)?<PI;Izm2ch-7(^wUe$Bf95%OB$VtVMPwpT1XwG*Jx1L^+Z+NsiI%jU3*@u@t
zp60snE9rK<_p5r@OLMC3*9Ha7TWIGE_pVqBnTWG|A7ubr==Dawebt`XZgE_ByTAPs
z_12wxy-j1IB@#}-V8z2QS(lFJ>B?RL@@;dS#M)>e$%*$%i7jTwqkkk<SJb;$CE?+>
z94VpE!D)Yj%><AcpaXU#ptN-^n@mMZY}?z|?8P8M4y*Czs9Co03|Xn4_OJkx!8Z+}
zGH~SzOU=dUUB*DY8FTA8()k&4+T71n9Mc(9bz|NVGgQ&$&*M_Yi;yR%PGhz^DO^*(
zc3RUmB+w<R70D`|nNlKN_CCh(n_hBujezXbkBUH*Q7z}@aBKo}ZnNLUgiR%s3bniP
zJ>Y8=Ld~x0<zd$p?>Q|8jo8psB!B4+c?2;nmR*DQVrM(o=W|R&+LV2>{)yi;`0R0Y
znj?}?ES(xp`G6c+T!<W0ao_rQnaz>w2B7?fy}1_KgwiHlv7PMF(zvx$ux)qU1F3m>
z<{DNF_Wf-+eEcjdqFpe9c&5NcB42HU;NkkFsYR*ACpE}IqGSQ+kjz8oqt8n_EbAU{
zYfVc}9)r>ftW_v;L&i=KLGt8TmCb`^#hNxphItw$>jR}sm$u!0xQUn%#jWvMN&Q=o
z1Yz;=ZfBx-VXawE&#;ATEevVw!BqGXGOA>;&uyg~&AzF0pLoA=VNB<Wmwz|;nS7<|
zm>~sE?jm%!;x`ht=wrfM<%SyO?u9W&*!peFUpoYS@*3_k_C4a-&V8mMIRsir?^2(9
zoc=GnKdtW!%S!<cc=00PW@m`7(sZHt`fYKtNs4@pbupg9nR*oL(pZ>tT|q{3ck1(S
z@c5jp)VolV+nEo(rZ+rFE+%SJhUR@Zc0vJZxB+(=eJ%g!OvjB(Rp#>9<c$bfY{mis
zvfXh|cN6)9rZf|PS%TPY84t+7f+C5*TaE;UEF=+mN8`#b<%&v5o8DrwZdkp!!+=c;
zaqnHy<|-$1t2MJ1nj-v4xTxHKX+J`QSD)!>H=!*PV+q?G!_?>-b`^Fd;1j3FZ;%XU
zv9#rAR#;yE#t(B#&dacII4O4yCH8flWn;--o$xZh!(udRMCt+%UV(u8D4Ge^C<6J>
zADpx4ibmg6%roTZbI-gr_SFAA_rXgS)r?O=5xpFjFSpxn*U%m8Pk2b%NN~NM_R0Oc
zza&*%3fVCCB7|9x8N3=?+?gUJhvoaNReoulMB4|*_g}eho^qSBmQEh#>LLzjP{@MW
zY;Hb3_8kB6vvei}9q2|&tA&sc2U0Z?UfUVo_DH*OyL0-;v2(o_e`-ZcL)4=XC7FM8
zx@Dlw7n4c|cz1XdoKwlxj{zGS7ljBFP?Ee(QnBGBda>Qn+RG0RccY^b$oA6twXI{-
zE8Rsh<yQ4T%V#z{=svh&J8=iBRwn?zr0b~fSg>OqI<Te=cm}KrwT_9;?NfSrrh<S;
z>j~biH_DRJ9H*&OFD>UQSlPBc&-KNkJ3NAx%9$0d2b8`=<elY2<g_=N2HB{*M+aSl
zEa!T1L!<PdZ)F`D3iJMG|0smBO3S#b=M&8%YxCXQx{-9AAMPS1Y#LwKkUYc7cIxb}
z4LHmQc5^^B>FH7z#a7-J8yG^`<9%q(txYQh?KeBSTI8(qo5Dg&X)L6h&X4n`n&Yn*
zWyi@84FYoBH?VR?6&Kw+VV4%J6h*EaN^~XHu2aXel+5}v2)JT==<)Ut2hFrkQ>WHW
zJ02J0<ySrnY*!fbjTlF}Pk(H(7f_77xt^W+y<^hSY{F=9_dxb6py^R{Nf^DpeFE!r
zeDheNuj_OT@r={0D1=5(94T7#^lWP(;41^cR!nuF;Vv9X%GGE-xuyC|9w_FqEe$xG
ziCHVH!K+R%ld|T7p)WrIZ2R-#GEBEU$C;EyCWBYUcvqX-f-bCnc<_aGB)}wNt~Wn+
zG@?}<dqX4p#t*8w^$O{{5!|ID?r6{%uCDm~0R8bX2h3Hig7_A_4sDo@>%alg37p)@
z>|v<n>j0e5Q%{s%n7<fSeWRGd8pAo6{3-nr%b!W)fa$<n%}N$nSsKb+KJyX)lm5y@
zqePE_FBJ**d01;aaR$=RU%BPPF*W03+^~34w%~x;=BM8eUHT{0{J$&%Z;B>D%1^88
zemp6&e77@^d2NOz$WD!L<^Kh>#f-{Z&|ZneYVmtA!$-jyr~iKo;77eZ3Z@M+V+N>;
zq<-}Hdl$mU*wpP)0mMyD(!ucCcCyFs>;o2!V_sqDk`5zd<7aL-AG|07l%9f*)m9&G
z$?Ka95x48n=-cqk(Fip6P<}bG(${k<(Q{;WF$X%vJ=dT21R?i+^`c%#!OmQN^pn)8
zdV$h>NK5jlcgX5POuz-=OE>8+TF(4FM{#MI&c}w7;j^dJ2<-a=3t($F2}i-I!~1NI
znn!yijJ;ym{w(~G^Va?&<}7<O5MVzY^#r63d*tjrGJ9;_+W4Q~pSnaLUN7!*P^?v*
zYohTQ_gP^Q3HVd*p?i-Y02c*tLE?%|MiVy})1|s0W+WvfLUbR$8b=#`_hV#h_sQ^C
z;n{ZY2e7UB>diR^)XR+sm#=+?rS~QLO**@ZG_O@f@InnOROMDRkmOv+dV!P-?(F;`
z=R4w+{tJhy>^Q&IjQHDH&6d^6fy`fP#o%pOUA?x^uWpipy>24<+q<b2^Imf{05$4|
zbiZho)kVIWcHRMSfZn#A)~UWVm;0lcRrquLq^Cm+1;}_Fw7XBRtxnW`tn*$BkxVM5
z7aJ@=P^E9C4U{Hk2Tv4{fpSYkDS+&Ca>K|)!oFPjLps#Tc%G<pMS9og#^O~c6S0K?
z(J)0aIg@+B9U!(0xkd>+j==jK-rFm-=j>lIB7C)Abg$x*`?roUIfVikq0RMzf+YOY
zv;U!gfF=VJ7GD7{P2SH_sd=`b%PM1yNVH0N;98QYAg+D+BuvdJ0k0v!0xRL$@7a_A
z{28L0!!H;D=qcg^GtA$ZrT0~rP_zus!05!4i#=$qy&^4zvys&*q%ZP2+uK>eOgzjr
zOD?9Z3H7Bmn^k3jo5IZ@Wx=!S^ptz%6*m`Nyz<tpSP2r~^5@Rl7_SfIve=x2^R7)<
z_gzrM_6O8LioJeFs~O|h_Sr&z&4d%y@Tf1O2AuHWz|;0Cb3kTWKP*%6KrL|I0Kh(S
zYOhj?&a!ViBICB_u^9AtV2_uzX?o`x5+(J3I!7az?F0!o==6$a2va@o@4~lZojeKL
zg|fNC89&Be7;}B9p@Qwf|FCV=T1^O0s!pKRXSVRPj;FCDdbTg45eym-@8z8~SDWrC
zWao?UK0k0uC>gvKwY${C<zgwilWyZj5kZTOqgHy{o9-~UUd^?Hbg$HB_)j+s2v_Jt
z8;9gS)EqN&CZ7AM<b6R_M)0d}WiJrx>Ha!EqAz6WDArXT`psX<oBN(i;@TTU2pkS@
z`B~x-K;PW$v`vKetRqpkzn|drfzs$`ZJIz`;>Uh^CL~I)xA`%XzTax>ImG>E4=+x$
zh+YR*uhYc|^Afj)klc8Ac7nQ|qd6EnWYpL`Uw8iVbc;ATk8RExP#?9WkIwlat+hKD
zPvCeU7+9$k(9;FhO{5QyHN(Qr^=zNS7oj#y3p-CDBDU2CR82!3SgNwb1MB7zx}3U{
z)KXPv6f5lFk-xl&nS}c4AbL%4wb#a0RdvnnW!mkXmmaZ+219B223cW4^$o5%#_5;X
zzc1<O8nvZSdJVwY)C;l1q^(uoTb7*}TL*P|bex@crrSuKn=RM;P{N@yM7V9@)Um&>
zVW$u>uXNo^9cCJg^h@{!Wt$FAL)H0im^wEE27eM^*I&MNv#>L0dMc;NvvIx8573vY
z?d{nB?B;k)PH4w0I~1$p$Ky-?Cdv{0-0Sji6$PNw#ZYRR{=P{ljeh67swR0cww=%p
zB(&=;X|D}nZH!>!;0Rf|!(%OZef2Z#BFz59Q=)PaI|ZDa3ZiJ&=W^e22_c7L6`ew!
z7t)#F7VvN9Uf}2*(YUqvV>kz{umu?(9ktKLP+ByJQlmNp_rK}#g4Yp|%D2bB^WHbi
zA_Umd{lI+4qB<|CS$zmz;>F8&-SxQRxM16@pNsi!JTqMr@N90*u`k7ccgCtn{R(3E
z6XR~z=J}5?Cif7eHpX*qqctTIW#83vZ4Dks>yfvQvvX3z?m2ZVdj*`6N%hfTz<F$y
zi9_$@!Zz-2gxB=40f)5GV9&q*<&tXQ47Ly9g}WEVy}RV?mu8miKlrcz`!hG3o8R-R
zOHJSOMA7fULur60$3hus6e9RP;*I5VK-8JtiMsL~O3OU?oEqUj79w%Qckd+CljGxu
z*Bm;}^<kmOQe?gV=E*rH{Yoz*FXFrLo%;Zh@jv4hT-{g&;?vGO>Qyy!PAVDi>-Hb<
z?l>!qn4X+-g7?|2Wy}H9|J1<^I0NPoZBe!}Gvuw4)_e#z>>~7mQ#NT$3>3!>#Bd-S
zJ{u36ml7_Hl}ZVvQNbC_rA|OA45Y&dkG~oJ7_afZR2kB}2t5GeJG#$a+}X~SpIuu5
zfmjahQwYLBJ2MY~_>S%Ox;MC2+qLKU7C|5luKlO6Y#Z&`AW$yL-m~%JhoHm@AdumH
zYrDydEG3+g-?x2#1B6*W3<|%skAZIs%}5C!0L7{8vnWGN2mae{`j)As^WDNiNB&i)
zwj$&ti0}CR)UxFn{v=^WzNZ8I*e7V(XN@}P>;DPp^SWc!m0~hCrcz-Tm3*N!z0eV0
zt>JI4?~yc`6>4Lp#6y0hkgd0Ro61-txq6XHYfCMUao5%>!hADf&bI^tKbNmp0sa4P
z%6sdTLa<M!C;ts#SXdK|V_$6;>)Fkpb<J&)Pe3r$e_R|r09xPInMCm!V0X5QH}_Cc
zp_p;JgEywqGwjiA3D2>|!echCg^u{ufBVW+Tk|i&%&HD~C_#LFLnbh}TF-<VVO%qN
zBW3M)HDZSSQSr}dyWYe3#Uo?fAaCA1g>cqBUT^<8xv3(H4f7?!1zf3r0^-v@8v;_p
zAnQ#QuKd+LpS^aJ-vgMpCy5nQm9z&#JhlEDO8hZ1sH;_nxe0E{vGg2_`&eME>%Eot
zaL`~2GNhugG*dbWfR=9880+WwG3!S`xj+;H#>~@MY5zK6-(t1x;=49+M`rX-tT_4m
z8vP0)VR(J)s((xaPNmc+x<Y)+cA`fP6L9Rm-ri{c%4}A``^S>fYMDVTaafA^{WX-0
z##*sny5w_enjcC~y2P4Rcf{|Q)wi}Q!?KG5siorAj7Vn+vuqn4xGnw=k$M$oPt-dX
z;*oR}EKrdH;^W?**b{zWVmIcyy?D2lChF`x!(h1;UPh#4<t&<yu^=&7>a^H!RqtZG
zsAc|2fj&?7^b1GEk+J7#?>1J&#cEw{NtkVK;Fh_?<;E9B>F*u=EtAr&ZBcyH<t9lV
zcXlo{r)1b?c!1MKloQDJ4ghs7E0E&aucs&9c)T$<ebZuRv`^uyDP5=XwV*MTM@A^&
z7-bSOP&<d8+OvMg_V}I&y-V9m1s0nN&#p2jP3a4rYL0Teqec<I4Rcg)fAo4aHNCJ3
zaAgYjxia=o0*P;sdc81DlK!RAM3$cA1?m$l?PJS;-o-<3hMW7DeDi6go3+JG`azML
zt99<_lETM!79HLP9s<&OqkYCsX+B_(<P)o>w0Ju^H71UNuLA`;VB0aLNa@Mhe_CZh
z>~BjYvF5V=CHvVu6-YN15n5lkt4pAeUv8C<`pQa&gQJ`^{t};i0T3Va`wV5<T)pR=
zXW0u7QZxkA*q%MwH#c^{5aJtIfPOs?;PWhR<AC*0RlRteQIrthK5vPW4cI$vzSDmx
zN-wTLbtvMVz;7LcNl!Kax54~h-zU9=Sd{-1Plf@pquAW8c}Pk)R&0Ee8erj?YY-U~
z&pm%TjI1Bj5yhB3U%ry~BL~3zEB$n~-yV0ns(;bz^uWvH^>0qKJqLp!tGx&Jx+8kJ
z4JGG0-#!H@+h0?gl>v^iGIVe#!g^@NX}76EO4w9r$B&XN&q+_w#R9yl!m}oV2tRhy
z<Q#uA+RC^%bSkr0M1pqu%xKB!Vou{f)lG04v#BAD^(*%fB5Q2kiP@W9R)mzbj1;5R
zcq;tg*gD(nvv;y#Gq+LQF3S(*%X}zpqDCma`8)ki@~47;j(tE*mf~tGojA1I*5e((
z6#)u}fUQ+H`A@YW`fsZf_QEF+^G3&H=mqs{(84BKVq<{=KnMsrV4MKdJwtUlB|<>K
zvB&y@Rk^P*qt<zf&zSpb?Ft34k0KloG#v%>3UKANgq+&X^{M<O!=k&UTj{3v!sQWv
zm&xAR4mX3r0R_PZ+q|YCdlhD@(l&r^a}Il6${V3?KzI)2@XSbl@C?6arFKk3WS9Q!
zsRK&t;(B`#oeXPKGj@Ar32Emx58jw@#3N3;cW#-mS*@gAZjmb%RURm#a%GgYlnmSc
z@c*U+Y{lnPt4iCQ9~YGBe|%L1>#JO8`o-%<TU0K$*a#FHq8S2t4FK!@@pk^OCn+zo
z@dOz1l_)TcIN_F_>jXek(B@a?=I?R4nNq*41bK!Dy#!!V77ZG-V5iTRjnU04WBEFk
zfMO3K3GElFZha05-3A%tJP}}$qe?1`hH~&h`z13dcR|G-#4uVmBW#_c2^25|!#&?h
zR?~o+4Ua+qzDh+NaIYcw&wC9e3!^20c)|UG!-h>-dM`Zo^KLaY8h1Q7fqR`&_&%*v
zeiFa@_<=)?v)|C`_dfvs+&IQPHrqbty4qy`x3xH3tc%@=b|Fp;`nlcdZ@3Y8_Ul|u
zj0`H}im_(rei0pntAq|tt86gOXR2C@KYXnf8}*odWUQi_g<rb3<uXlVHH(}Xp<B6i
zIgk5NQt?o+IN@A((U5VmTNUl=Egir>GXakr_A?==jCuOVj&tXa7D^NoxC+>)#7u$e
zgeA`W0{e+vJ(c}7@z|;q0Pz8=LZDOZKzr_Y9|!Pds(%a09kJg;ci1k^)pbfSbC-4q
zOXnSe&vFY%+y4OGS5+rirn_iQnaWw3&jM5f02!+&aPOI{N3mOFoyIQWm-o8&nGD=>
z-sz4Dz~ZYDr+2rBFVK|>9VWYPNJHpQ890N*QQS<vx|F|ID;dxzotH4?>|iDF_51sf
z+>s>8eT91mfaXt$fOMWEkYv~Gw9Mbz8~Za8xPIv=Ig?@$NWJc8X|&nlCj#+vECSs4
z2frvslQ#JmTp1}0FVFpY`M^WAfBEqGR9Xc1ApK;pu#!__j2@KR{Bu<qKq1w#zD-cb
zTm|^A^eV&orw1}J(5IbYCZTsN%UV1w1*P|;d@vMY+pP)e1h{Y1d|PK^s|h30i~WzH
zBql?Yt-KRTM-O{g6N8${@4N8(1^K@bVu>I53qH3Lx3%_)<5({CNo13huLTmB>20Wv
z*rB^X^CAh2B&|(9N$Y09O=kytHbybb&dzFUYs-#!e@Zq`wkWogq2<ZA^(g|Eum}YP
z=7eI)inGcg8|>zH3|qNIuwlECNh-==J0bNSV|hb2;ITX+n+0YW^?)#P_N}$+v%p@+
z2|ri~Qi3ZOsW1<UOH&Ra&i+Q)%>Q6%byDI&#<w}{z6&CC0k#bE9dgaPJX}xzj4@&a
zvp+&km~3Z3*9KK+WWs2`crmeVrB!rJ%c|1d@XEkDyH6^+o1La8GwAntVKcN%u`60N
z@<@L5$2M_n$W&{J>+yn|P%36TG`Qb7AmN;R*^lBv|Aj^1hgF>Hd`}^yLw@?@5C>C}
zq#9c|#PwhO5<OEhhi{Z}RyIOlOKjw-@FX9cQ^KvU$Yro>N{bhIpGkCQN<1uG8A@Fr
z4<m`*lt_7+6Sh;W%u^_7D06S-%ae^G>!fkp`XGwAEz}SHS~VzfPM<PAQVr4F7)(x1
zrf25aCKC)&JIonB0GFmNvPUcZG1!^Q9UF`ro-02$`MzZ1ZrhxfduyKo7ehVw108W`
z(`Ju>cN4i!#3Uh>7|-JGBw!(H?B_BTdJ2;-DUOEJ95-E*amHWgEg-Wu<xmR~v2k9v
zuB`T1c@_H6$vWqoY3u2k4D-skngj&}VLV3>`dP5#<kgwHBxE;OM@Q14a44WDj@9u-
z)@O&-mxjT)uXtrW@0gjrvedm&=R4cIMZ$(r=6dJ2bAskBl`}scir}z$s7kvDMqRNI
zX9nFmqsDgq(ordCpe2)(nIQ>4_}QErr(Pshh(Nsql`tyC7plFbo{1po=$W2!N)h+O
zk?PkUW$W;$gl=00j8Z85wx}HBlz79&J!qd*PUKi!V0p!R{wh^SVy$a064>f6lcQpm
z?=N~sB2Sru{TGM)zC1fWSL{_Yi@54B^frK3&iihy{~`jMfiUYKRXE~cfvZGDuB&>2
zh#^NyOW0$3$`bY9t%`Tbs$n!rKPu4#8c3ZjK#iDey-H6n9G|bA&NfD2sdFgn8sEXH
z2YEr_kU&s43qX#^=gZuq+oBp02}93;VCZQ<PA)MJ_=U@Q_XjS|u8uZ5s)Z0cvbe}!
zGA$9t1^RuNuA?;-$QmLp1I0oX#UAza-rb?0>(^jNh5Dzis_bDA{l0Sn<ZGWwL?Aoc
zn?ZI^a?d$V)BKHv@=loddm$1UOYhM779X-Xl}hR8OqZH&6-7&@CR4Ng&9Butw7q(f
zgQHhW!XLVI?D79HD6D`2Vq}d9-*oGMi!3WzNm?T1o;9;VrV&DQA%~|EP47B5I9Q;+
zJY6_q$~`A4xu<R9p%bO56-x=SV^v<`%H=Xpq0vmcenIN^?v8m*-+C?HxSoVOh^6to
zykw6vv(6>1JIr?HySTXYt*_kpUS{}#?p0W+<HSlQ^gyKiTNUHEe%`R5<>sw&=$iT}
z(XFJ!yZ|wnm=cuNHiYpAP{xoCY?#L3-}97{QSAZUkRxJ)OXW}f-zZkPVjA=yH_KfO
zDoQ5SgrA(~Z|>@8SuCPo3)ywM48`U8*+I*!Qf$9Xy?8$K+NL|mDhSU=4GnASLfhW^
zD3eOODu6u+*4M9OO75u%m#;*wH<=WY)&-F(>wr_ql7|pmQ$$Tq&mZRZ5m5Zbh?EIF
zaSjIkb~nyp0RZq$qRck$Had&2g`3*J3q{J;x1T;$Z-IU+sgriI2)oa10r9Q1^b9Ne
zaxouYHxa{CR$gA*GO>M7m-%ktNicgFMfuG5u6_tJkK#k%cnKX6Xi!ZFm~2ic59Nwr
zcpI%8I=eN_NUG7G^GaZbZX3x1t`;z2q^^lzA+`4@2uaEkUw9|RlJVFb7Z(MP`T5=g
zAbW8;z?l{~SB&ySS{FW0+Z-~!I=4L5`?y&D$+-W_#+w8SAbDl-Q-q{DUW!IEYz_t0
z3@T{l6m+z<9;-{yqI^Hhap(0XUG_A~_8{4SL$Ys4)uo<)cz2*}2T4h`b4KsQ9P7=k
zc}Jz?xz3<P9`zdF(tYPybR!tJ9uBaFqhHI^j!MeOlG0#e#+3_&mkLs^@OoLlR&!0}
zmwk~@+C^Q_80)eR4x>&rechcgP6r+PpGDjn`j<w|bB^6f(_ON@sEg8{sJ^5z>nnLA
z%2iZPiM7?#%g0yQy{TXF8{Qd>pXXU-oQsTBl|8eedhA;0=6rywykp@?z)01D(13&N
z{@dN_*pt1}^As^g9!0=j8|Jj^TM~N(&RL8f4|h~5u@EY`GllS&L<(^PyKT}-G8fw}
z>+XEbV>J;9XL-yG3mWB|l?Rf2a>Sl$v*u0K+I!P2XRY&Gn&ZjrBj<OfRdy|Mx*E1u
z+NN<C997#u7?2KLTL8k31xe7{;;Q$z>riQl?_A{I`wltE(1YU9C+gTPl4Vliq(dvU
zDx>SzPj88`H=<_+k`!O0q~sre*q$cQSH1=+;_ro-2!S*nyUEfWxZadfCsU{gisJ)f
zQo+kN`=2iZpZiat_dm-4pXmoZz#?q>mz{qiep)5B9s<Pztr|tq%)U302R_IHR6+6M
zqkBT-9^(&vdk|d#*bPV@sDd8a-gQGV?7ge$d@zmy>xMJ_-h;+&XOi-O7qz2=0HntQ
zoB#;aVugY3#Z~N$=?x(0e3dhOCTx2W6Yz&vASP}|+u#2IZe42ugM&ccK+gVu=u6Yg
zpODp)GrbtE@j~N*A<Jl>Td{3%7=#wlWVva$PXa%W7+lm;lk#77Vh1U<qr$NB*g8b<
z$aGWlR%PMRNORU$=<I?bKrfwDL19DY)<Qubeq_6#FcA30Nl}Q}YX$&qbdQL65^muh
z<@g+E4zh{?luE^T{N<}^<zIk~=1~qn3)vdZr@7!3KLEra>IuNaG~kzhl7PSLvlS)O
z2$a|1!mO~*n$D-Ip=xXs!0qY}$wzTO&H7|S85kPS`#0^9sQ(^>I{4p!P>nR-C<N;F
z6#0e>=takA%Yv!-p0(@SGXDMbvw0;Ru1jLgk6GYk^*vfPP$!XOji-n+{uRDD$kqiV
z6@f|o1FVB)WwHFlD}!SN0@Bu_fxo8M)e$_|%(#_|e5JB>V;5t9_PMyt-V*Yl<DKAQ
zZ_^j1HfOMb2n}DP<S+?v;gOfUyk3}Ke5!kc6FUZ8Op406Z!VisdYi&O4^uLM^M!jM
zvLFw`KAg6rNCLNCDxhZvxBhGP2w~jF$j^E(qsq9&YC&X5Z`*rUi+<^yoxXC}fn(2&
zqynu8vJD<(_q4z=N{h(A`m_196~ZEQW=lm~h(6C=n~wS?o1=pv(_DfhJz!oxs22es
zJr`KZqapPkt{vvVMnmAmJfRYgZCZwT2S#jUCPx+(#yiwERxv|eFGg)SgOfz2@Bl~2
z>D!CNKU)Gb=^Pz8<IhqGx*<T#8zG(oT!WgY%N)eWRIJY)1)B$C<29;|f=SiWle$7s
zwjU~k$P4Ed3+|m+{Vcz`F?Y`T6RZ}_xw~Eq?Fcgzaq6WvXVW5u<zE;!j0jzSq}$UZ
zYOP#fJQyT3{$p|_RH{SC1D)-XXiJ5D8g;U%_|{^Qf`kNz-8~&6tMLhUy+#0cpsnev
z@a_$;uE?%MXH3w6tZbn7_@x;dCA7dVt<>D4Wm!?Al8w<^Z)q&{oAv@1`^=TL43+o<
z(}&q7))%%G8gKrLp=m3tNx36CI&*wJMbsJ7CEYo~(_?Zx02Z1sKGDvPp_{#P0tZCn
zvP7D@eii`DQC-M=2mt-;v(KCeRZt`JX6+vWf<1Y*i-0<3q{RANquTX3hgnuF;A=!q
z^afhbIc!%`%&Fn8I{4VFViQv`$YA4&=K(dl(>ZRPeFU_>MNjqERqp%c=*@NO;|hfs
z<KqFvsqg|XuZEqeHf2VLmlyS|Tb4-d?!zTmScqSuC;EFdu^q=KjMlaKX^%wnS&6@`
zm~U9^n=$^{aM!Nj73o@SM&LEtgGZ8G0Hme{{!G=i&CVM~wQ+Lx$sSuUNeK}3@97D2
z@)x*R+vSpY%aYmZEU}(!fNOhgiym+E$A)NJ;ZP@>E6^)s3D92Di8e)4-(P*)BSKK<
z|F_*1m!OeTkDEu!J*@V28*SkAGmdx3aw%CQ48m<-hl>a<y=4t`n^&w&qOVK_2+5v<
z>t2*jz7yK+squ%jsVlFXKazqfAB#4I!3!}(I_sZo%IU&fmGn2=W2b^@-bLeB@%Y`L
z>zU3+!M3Vy%A*L)Zy-(LX$pY!jxEB5<luGn23pQFHLLrMB`DsuBt5gc-NhK%V=OrN
zHc{8q0{MB#60aQ{XUkl8*Y!nazCn{!f$7O~yh%bYl-NrwaZ<8=YTOA_xop~VlXd=^
zEz{YR{UPVr!>FDa{OSbvD@}!va#o7A?PA%@Vh3WH@^NlhM25)MLI};~d^f(XZ4uW$
z@8++5?p@EWj`!M#{T=MTBb2G{Bmo>{_?KToq(-s9j$``j=V&MZ)LL?0;x(GJXBsWk
zk+Ql)K>7FL2}078;?|mcGABgI8U}Y+8?iJrD0Zfior-aN{9Ld;Q`b750tR2(OEHK|
z@QG!=^hg-hVJ7ZGP0q6p3f_deU#`6-tqVB7(m$8z@xo`<I`-Z2tYGvMjEOIcRjBci
zB)pL0sw#qLRX{mHsl7efEiN6IeVNJ998d%;F*JiR|K$sC6DDDsTGXRf{*ET@o&&vM
z9j5IZWOOMOQT@%w5fhZ<YThkHr(O=U;v5&+@0lZ#@V>#*`yQWs64F*Z{w}T$e|bem
zRZVIe-xHayHO&gP{iur5+^rjQs&?fk_uq65(z_H({DNLrUhVz5n{Ie`5N&wx-llOS
zw}P^Z{VQd^9hY?~@)FnqVG>s>bw&G%y_va+OyJqRe{s$AZ{EFo2jr=d;%8Y;;C`}S
zla5T}bm=JQY#@#k*MxEe7upXBGT=TKMs1s!AHTEX$vj_8X}oHB!Y;AtKCPI!G)>I0
zq>G%^t>WidPyfaBSDq~qs!jxM{2jBBZDP6kQH2{mJNH|Q5<>9yFV{jTE@Gpi?ytcX
zz!r)8bmTKO4-Htnom75=5v-J(JRa|el?XS{7?W6+#Fnby=H{T-HoYM4<r_f}jcVl=
z==|rTdm=$mw=ZYnrFUa~hVUglm^zCC?h=QV(Vhju0)gg)G9w#V8=_Y33qW=JH7qcm
zyj=KhW(n23jMtbeI)VO8ZGp79R8y7O1n;60y-e{V<&>^l(9^yl8y~>Y4JSUBdmCUl
zWu?9SXqur}991fn%4o-u3P$L%gB;vNd5CyzJgy`5W$we=2}eHqe_f$AlFKo49RCB+
znuX|H8}D<E<g^T#=&KQj26jwebDOJuVH)?AcQzLO<T#iXXuDfk=&40v6?<FJ_EZH8
zw(0k6dTfSlq|A^z{co2fO>9OZEWb?%F1Aa32d+R7o}L++divBRq4Ku43LZ7z*F*nQ
z)=?;3oKW+rucek_a+P{nHihG=0)oQD#0pd7{POAFl7s7V{FLyOqmuM8x<0K|ywMd*
z*O)u`{w=+jh-<`~)O6OleoQLtn?K|(QE2%zR{lKo-;$09Zw)8s9p86lfsz2zhj$>z
zD9|p_?xWP-2FsO_hVItVQ2pt%pW{AfD9;Tch@K(@)W1CjBrH*mrL3?Uo;mv=5cKnc
zTKT|H96#pIe97o?l+eiDotwSn=><l;+!h;3K!72^;`dC(8=wXHS*~vEdx@D!H0Yb%
yHZ2ig2{w!E6tIBmkY}{AK>!r;|73=_t>UwA=pI$&Y8xOGq<uqQqeSiQqyGV>oZs;P

literal 0
HcmV?d00001

diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index b6a44f76b..29305405b 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -14,3 +14,5 @@ add_subdirectory(11_add_rmsnorm2d_rdquant)
 add_subdirectory(12_smoothquant)
 add_subdirectory(13_moe_sorting)
 add_subdirectory(14_moe_smoothquant)
+add_subdirectory(15_fused_moe)
+
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 3b198502d..3cf0c2595 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -52,6 +52,7 @@
 #include "ck_tile/core/tensor/tile_elementwise.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/tensor/tile_window_linear.hpp"
+#include "ck_tile/core/tensor/tile_window_utils.hpp"
 #include "ck_tile/core/tensor/update_tile.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
@@ -62,6 +63,7 @@
 #include "ck_tile/core/utility/philox_rand.hpp"
 #include "ck_tile/core/utility/random.hpp"
 #include "ck_tile/core/utility/reduce_operator.hpp"
+#include "ck_tile/core/utility/static_counter.hpp"
 #include "ck_tile/core/utility/to_sequence.hpp"
 #include "ck_tile/core/utility/transpose_vectors.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 3feede4d2..bebf035e9 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -621,6 +621,65 @@ CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
     asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }
 
+CK_TILE_DEVICE void lds_load_fence(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+template <typename scalar_type, index_t N, bool pre_nop = false>
+struct buffer_atomic_add_if;
+
+template <bool pre_nop>
+struct buffer_atomic_add_if<bf16_t, 2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(v_offset),
+                       "v"(bit_cast<mbuf_t>(value)),
+                       "s"(res.xy),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <typename scalar_type, index_t N, bool pre_nop = false>
+struct buffer_atomic_add;
+
+template <bool pre_nop>
+struct buffer_atomic_add<bf16_t, 2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag = 1*/)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3"
+                     :
+                     : "v"(v_offset), "v"(bit_cast<mbuf_t>(value)), "s"(res.xy), "n"(i_offset)
+                     : "memory");
+    }
+};
+
 namespace impl {
 // below type indicate the data type used for buffer load inline asm
 // clang-format off
@@ -810,6 +869,11 @@ CK_TILE_DEVICE void buffer_store_fence(index_t cnt = 0)
     asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }
 
+CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
 // buffer load i8
 CK_TILE_DEVICE_EXTERN int8_t
 llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
@@ -2378,6 +2442,45 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer<T, N>& src_thread_
 #endif
 }
 
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer<T, N>& src_thread_data,
+                                              T* p_dst_wave,
+                                              const index_t dst_thread_element_offset,
+                                              const index_t dst_linear_element_offset,
+                                              const bool dst_thread_element_valid,
+                                              const index_t dst_element_space_size,
+                                              bool_constant<pre_nop> = {})
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+    index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
+
+    if constexpr(oob_conditional_check)
+    {
+        buffer_atomic_add_if<T, N, pre_nop>{}(src_thread_data,
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              0,
+                                              dst_linear_addr_offset,
+                                              dst_thread_element_valid);
+    }
+    else
+    {
+        buffer_atomic_add<T, N, pre_nop>{}(src_thread_data,
+                                           dst_wave_buffer_resource,
+                                           dst_thread_addr_offset,
+                                           0,
+                                           dst_linear_addr_offset,
+                                           1);
+    }
+}
+
 // buffer_atomic_max requires:
 //   1) p_dst_wave must point to global memory
 //   2) p_dst_wave must be a wavewise pointer.
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 65a3a4e2f..afcf982a6 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -73,6 +73,24 @@ CK_TILE_DEVICE void block_sync_lds()
 #endif
 }
 
+CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
+{
+#ifdef __gfx12__
+    asm volatile("s_wait_loadcnt %0 \n"
+                 "s_barrier_signal -1 \n"
+                 "s_barrier_wait -1"
+                 :
+                 : "n"(cnt)
+                 : "memory");
+#else
+    asm volatile("s_waitcnt vmcnt(%0) \n"
+                 "s_barrier"
+                 :
+                 : "n"(cnt)
+                 : "memory");
+#endif
+}
+
 CK_TILE_DEVICE void block_sync_lds_direct_load()
 {
     asm volatile("\
diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp
index a88780459..df0f54c5e 100644
--- a/include/ck_tile/core/arch/utility.hpp
+++ b/include/ck_tile/core/arch/utility.hpp
@@ -102,4 +102,28 @@ CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
 #endif
 }
 
+template <typename T>
+CK_TILE_DEVICE auto flag_to_exec(const T& v_flag)
+{
+    static_assert(sizeof(T) == 4);
+    // per-thread v_flag store into 2x sgpr
+    uint32x2_t exec_flag;
+    asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
+                 : [s_exec_flag] "=s"(exec_flag)
+                 : [v_flag] "v"(v_flag));
+    return exec_flag;
+}
+
+template <typename X, typename Y>
+CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y)
+{
+    static_assert(sizeof(X) == 4 && sizeof(Y) == 4);
+    // per-thread cmp store into 2x sgpr
+    uint32x2_t exec_flag;
+    asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
+                 : [s_exec_flag] "=s"(exec_flag)
+                 : [v_x] "v"(x), [v_y] "v"(y));
+    return exec_flag;
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index 2cc788d42..7dffa0e55 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -437,34 +437,74 @@ struct buffer_view<address_space_enum::global,
     // i is offset of T, not X. i should be aligned to X
     template <memory_operation_enum Op,
               typename X,
+              bool oob_conditional_check = true,
               typename std::enable_if<
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                   bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i,
+                               index_t linear_offset,
+                               bool is_valid_element,
+                               const X& x,
+                               bool_constant<oob_conditional_check> = {})
     {
         if constexpr(Op == memory_operation_enum::set)
         {
-            this->template set<X>(i, linear_offset, is_valid_element, x);
+            this->template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
         }
         else if constexpr(Op == memory_operation_enum::atomic_add)
         {
-            this->template atomic_add<X>(i, linear_offset, is_valid_element, x);
+            this->template atomic_add<X, oob_conditional_check>(
+                i, linear_offset, is_valid_element, x);
         }
         else if constexpr(Op == memory_operation_enum::atomic_max)
         {
-            this->template atomic_max<X>(i, linear_offset, is_valid_element, x);
+            this->template atomic_max<X, oob_conditional_check>(
+                i, linear_offset, is_valid_element, x);
         }
         // FIXME: remove memory_operation_enum::add
         else if constexpr(Op == memory_operation_enum::add)
         {
-            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
-            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
+            auto tmp =
+                this->template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
+            this->template set<X, oob_conditional_check>(
+                i, linear_offset, is_valid_element, x + tmp);
             // tmp += x;
             // this->template set<X>(i, is_valid_element, tmp);
         }
     }
 
+    // i is offset of T, not X. i should be aligned to X
+    template <memory_operation_enum Op,
+              typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE void update_raw(index_t i,
+                                   index_t linear_offset,
+                                   bool is_valid_element,
+                                   const X& x,
+                                   bool_constant<oob_conditional_check> = {},
+                                   bool_constant<pre_nop>               = {})
+    {
+        if constexpr(Op == memory_operation_enum::set)
+        {
+            this->template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
+        }
+        else if constexpr(Op == memory_operation_enum::atomic_add)
+        {
+            this->template atomic_add_raw<X, oob_conditional_check, pre_nop>(
+                i, linear_offset, is_valid_element, x);
+        }
+        else if constexpr(Op == memory_operation_enum::atomic_max)
+        {
+            // this->template atomic_max_raw<X>(i, linear_offset, is_valid_element, x);
+        }
+    }
+
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
               bool oob_conditional_check = true,
@@ -533,6 +573,7 @@ struct buffer_view<address_space_enum::global,
     }
 
     template <typename X,
+              bool oob_conditional_check = true,
               typename std::enable_if<
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
@@ -585,6 +626,39 @@ struct buffer_view<address_space_enum::global,
     }
 
     template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE void
+    atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
+    {
+        // using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
+
+        // X contains multiple T
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
+
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
+
+        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
+                      "wrong! X should contain multiple T");
+
+        static_assert(get_address_space() == address_space_enum::global, "only support global mem");
+
+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+        amd_buffer_atomic_add_raw<remove_cvref_t<T>,
+                                  t_per_x,
+                                  Coherence,
+                                  oob_conditional_check,
+                                  pre_nop>(
+            x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
               typename std::enable_if<
                   std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index f150fc54c..b280a1725 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -22,28 +22,32 @@ template <typename BottomTensorView_,
           typename WindowLengths_,
           typename TileDistribution_,
           index_t NumCoord,
+          index_t i_access           = -1,
           bool oob_conditional_check = true>
 CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution<BottomTensorView_,
                                                                          WindowLengths_,
                                                                          TileDistribution_,
                                                                          NumCoord>& tile_window,
+                              number<i_access>                     = {},
                               bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
+    return tile_window.load(number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
 template <typename BottomTensorView_,
           typename WindowLengths_,
           typename TileDistribution_,
           typename LinearBottomDims_,
+          index_t i_access           = -1,
           bool oob_conditional_check = true>
 CK_TILE_DEVICE auto load_tile(const tile_window_linear<BottomTensorView_,
                                                        WindowLengths_,
                                                        TileDistribution_,
                                                        LinearBottomDims_>& tile_window,
+                              number<i_access>                     = {},
                               bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
+    return tile_window.load(number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
 template <typename DistributedTensor_,
@@ -51,15 +55,35 @@ template <typename DistributedTensor_,
           typename WindowLengths_,
           typename TileDistribution_,
           index_t NumCoord,
+          index_t i_access           = -1,
           bool oob_conditional_check = true>
 CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
                               const tile_window_with_static_distribution<BottomTensorView_,
                                                                          WindowLengths_,
                                                                          TileDistribution_,
                                                                          NumCoord>& tile_window,
+                              number<i_access>                     = {},
                               bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.load(dst_tile, bool_constant<oob_conditional_check>{});
+    return tile_window.load(dst_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
+
+template <typename DistributedTensor_,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          index_t i_access           = -1,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
+                              const tile_window_linear<BottomTensorView_,
+                                                       WindowLengths_,
+                                                       TileDistribution_,
+                                                       LinearBottomDims_>& tile_window,
+                              number<i_access>                     = {},
+                              bool_constant<oob_conditional_check> = {})
+{
+    return tile_window.load(dst_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
 /**
@@ -76,6 +100,7 @@ template <typename T,
           typename WindowLengths_,
           typename TileDistribution_,
           index_t NumCoord,
+          index_t i_access           = -1,
           bool oob_conditional_check = true,
           bool pre_nop               = false>
 CK_TILE_DEVICE auto load_tile_raw(T& tile,
@@ -83,11 +108,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
                                                                              WindowLengths_,
                                                                              TileDistribution_,
                                                                              NumCoord>& tile_window,
+                                  number<i_access>                     = {},
                                   bool_constant<oob_conditional_check> = {},
                                   bool_constant<pre_nop>               = {})
 {
     tile_window.load_raw(
-        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+        tile, number<i_access>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 
 template <typename T,
@@ -95,6 +121,7 @@ template <typename T,
           typename WindowLengths_,
           typename TileDistribution_,
           typename LinearBottomDims_,
+          index_t i_access           = -1,
           bool oob_conditional_check = true,
           bool pre_nop               = false>
 CK_TILE_DEVICE auto load_tile_raw(T& tile,
@@ -102,11 +129,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
                                                            WindowLengths_,
                                                            TileDistribution_,
                                                            LinearBottomDims_>& tile_window,
+                                  number<i_access>                     = {},
                                   bool_constant<oob_conditional_check> = {},
                                   bool_constant<pre_nop>               = {})
 {
     tile_window.load_raw(
-        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+        tile, number<i_access>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 
 template <typename LdsTileWindow_,
@@ -114,6 +142,7 @@ template <typename LdsTileWindow_,
           typename WindowLengths_,
           typename TileDistribution_,
           index_t NumCoord,
+          index_t i_access           = -1,
           bool oob_conditional_check = true,
           bool pre_nop               = false>
 CK_TILE_DEVICE auto
@@ -122,11 +151,14 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile,
                                                                WindowLengths_,
                                                                TileDistribution_,
                                                                NumCoord>& tile_window,
+                    number<i_access>                     = {},
                     bool_constant<oob_conditional_check> = {},
                     bool_constant<pre_nop>               = {})
 {
-    return tile_window.async_load_raw(
-        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+    return tile_window.async_load_raw(lds_tile,
+                                      number<i_access>{},
+                                      bool_constant<oob_conditional_check>{},
+                                      bool_constant<pre_nop>{});
 }
 
 template <typename LdsTileWindow_,
@@ -134,6 +166,7 @@ template <typename LdsTileWindow_,
           typename WindowLengths_,
           typename TileDistribution_,
           typename LinearBottomDims_,
+          index_t i_access           = -1,
           bool oob_conditional_check = true,
           bool pre_nop               = false>
 CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile,
@@ -141,11 +174,14 @@ CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile,
                                                                  WindowLengths_,
                                                                  TileDistribution_,
                                                                  LinearBottomDims_>& tile_window,
+                                        number<i_access>                     = {},
                                         bool_constant<oob_conditional_check> = {},
                                         bool_constant<pre_nop>               = {})
 {
-    return tile_window.async_load_raw(
-        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+    return tile_window.async_load_raw(lds_tile,
+                                      number<i_access>{},
+                                      bool_constant<oob_conditional_check>{},
+                                      bool_constant<pre_nop>{});
 }
 
 CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0)
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index 29c20bed0..568d618ec 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -201,4 +201,30 @@ CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number
     return unpacks;
 }
 
+namespace detail {
+
+// check if 2 static_distributed_tensor has same data type and size of element
+// but only difference in distribution
+template <typename X, typename Y>
+struct is_similiar_distributed_tensor
+{
+    static constexpr bool value = false;
+};
+
+template <typename TypeX, typename DistX, typename TypeY, typename DistY>
+struct is_similiar_distributed_tensor<static_distributed_tensor<TypeX, DistX>,
+                                      static_distributed_tensor<TypeY, DistY>>
+{
+    using Tx                    = static_distributed_tensor<TypeX, DistX>;
+    using Ty                    = static_distributed_tensor<TypeY, DistY>;
+    static constexpr bool value = std::is_same_v<typename Tx::DataType, typename Ty::DataType> &&
+                                  Tx::get_thread_buffer_size() == Ty::get_thread_buffer_size();
+};
+
+template <typename X, typename Y>
+inline constexpr bool is_similiar_distributed_tensor_v =
+    is_similiar_distributed_tensor<X, Y>::value;
+
+} // namespace detail
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 698ce5378..4c72ed085 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -333,6 +333,48 @@ struct tensor_view
             coord.get_offset(), linear_offset, is_valid_element, x);
     }
 
+    // X is vector of DataType.
+    // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    update_vectorized_elements_raw(const TensorCoord& coord,
+                                   index_t linear_offset,
+                                   const X& x,
+                                   bool_constant<oob_conditional_check> = {},
+                                   bool_constant<pre_nop>               = {})
+    {
+        buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
+            coord.get_offset(),
+            linear_offset,
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
+            x);
+    }
+
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
+    CK_TILE_HOST_DEVICE constexpr void
+    update_vectorized_elements_raw(const TensorCoord& coord,
+                                   index_t linear_offset,
+                                   bool is_valid_element,
+                                   const X& x,
+                                   bool_constant<oob_conditional_check> = {},
+                                   bool_constant<pre_nop>               = {})
+    {
+        buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
+
     CK_TILE_HOST_DEVICE void print() const
     {
         printf("tensor_view{");
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index e41024698..caeb03852 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -292,12 +292,15 @@ struct tile_window_with_static_distribution
     {
         constexpr auto tile_dstr = TileDstr{};
         auto dst_tensor          = make_static_distributed_tensor<DataType>(tile_dstr);
-        load(dst_tensor, bool_constant<oob_conditional_check>{});
+        load(dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
         return dst_tensor;
     }
 
-    template <typename DistributedTensor, bool oob_conditional_check = true>
+    template <typename DistributedTensor,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true>
     CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+                             number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
     {
         using Traits   = load_store_traits;
@@ -785,6 +788,73 @@ struct tile_window_with_static_distribution
         });
     }
 
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true, bool pre_nop>
+    CK_TILE_DEVICE void update_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                   number<i_access_unsupport_>          = {},
+                                   bool_constant<oob_conditional_check> = {},
+                                   bool_constant<pre_nop>               = {}) const
+    {
+        using Traits = load_store_traits;
+
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // loop over thread tensor space [y0, y1, ...]
+        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
+            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
+            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
+
+            static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
+                constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
+
+                // data index [y0, y1, ...]
+                constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
+
+                // read from distributed tensor
+                vector_t vec_value;
+
+                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                    constexpr auto idx_ys = generate_tuple(
+                        [&](auto jj) {
+                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
+                                                            : idx_ys_start[jj];
+                        },
+                        number<NDimY>{});
+
+                    constexpr index_t d =
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                    vec_value.template get_as<DataType>()(j) =
+                        dstr_tensor.get_thread_buffer().template at<d>();
+                });
+
+                // write into bottom tensor
+                get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
+                    bottom_tensor_thread_coord,
+                    0,
+                    vec_value,
+                    bool_constant<oob_conditional_check>{},
+                    bool_constant<pre_nop>{});
+
+                // move thread coordinate
+                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
+                {
+                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
+
+                    constexpr auto idx_diff_ps_ys = container_concat(
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
+
+                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+                }
+            });
+        });
+    }
+
     // move thread's botom tensor coordiante
     // [x0', x1', ... ] ==> [offset]
     // also move window-origin
diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp
index 4b921ec5b..96a8352c0 100644
--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -432,23 +432,38 @@ struct tile_window_linear
     CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
     {
         constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
-        // since this is linear offset, we assum bottom X tensor is always linear
-        constexpr index_t linear_offset = [&]() {
-            constexpr auto x_idx_ = linear_coord;
-            constexpr auto x_len_ = TileDstr{}.get_lengths();
-            static_assert(x_idx_.size() == x_len_.size());
-            constexpr index_t x_dims_ = x_idx_.size();
-            index_t cu_stride_        = 1;
-            index_t cu_offset_        = 0;
-            static_for<0, x_dims_, 1>{}([&](auto i_) {
-                auto r_i_ = number<x_dims_ - i_ - 1>{};
-                cu_offset_ += x_idx_[r_i_] * cu_stride_;
-                cu_stride_ *= x_len_[r_i_];
-            });
-            return cu_offset_;
-        }();
-
-        return linear_offset;
+        constexpr auto is_pure_linear_tensor =
+            reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
+        if constexpr(is_pure_linear_tensor)
+        {
+            // this case usually is a LDS window, everything is known at compile tile.
+            // we directly use BottomTensorView transform to compute the offset, in case padding
+            auto bottom_tensor_coord =
+                make_tensor_coordinate(BottomTensorView{}.get_tensor_descriptor(), linear_coord);
+            return bottom_tensor_coord.get_offset();
+        }
+        else
+        {
+            // this case usually is a global window, where last dim can be linear
+            // we hack here, that use the original TileDstr to compute the linear offset
+            // ... hoping that there is no extra padding between other dims, which make sense
+            // since that would introduce runtime length (so can't use linear offset)
+            constexpr index_t linear_offset = [&]() {
+                constexpr auto x_idx_ = linear_coord;
+                constexpr auto x_len_ = TileDstr{}.get_lengths();
+                static_assert(x_idx_.size() == x_len_.size());
+                constexpr index_t x_dims_ = x_idx_.size();
+                index_t cu_stride_        = 1;
+                index_t cu_offset_        = 0;
+                static_for<0, x_dims_, 1>{}([&](auto i_) {
+                    auto r_i_ = number<x_dims_ - i_ - 1>{};
+                    cu_offset_ += x_idx_[r_i_] * cu_stride_;
+                    cu_stride_ *= x_len_[r_i_];
+                });
+                return cu_offset_;
+            }();
+            return linear_offset;
+        }
     }
 
     CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }
@@ -509,6 +524,64 @@ struct tile_window_linear
         return dst_tensor;
     }
 
+    template <typename DstTile, index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(DstTile& dst_tensor,
+                             number<i_access>                     = {},
+                             bool_constant<oob_conditional_check> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
+
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess = number<i_access_>{};
+
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
+
+            // read from bottom tensor
+            const vector_t vec_value =
+                get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
+                    bottom_tensor_thread_coord,
+                    linear_offset,
+                    bottom_tensor_flag,
+                    bool_constant<oob_conditional_check>{});
+#if 1
+            // data index [y0, y1, ...]
+            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
+            // write into distributed tensor
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
+                    },
+                    number<NDimY>{});
+
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                dst_tensor.get_thread_buffer().template at<d>() =
+                    vec_value.template get_as<DataType>()[j];
+            });
+#else
+            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            static_assert(d % traits::ScalarPerVector == 0);
+
+            dst_tensor.get_thread_buffer().template get_as<vector_t>()(
+                number<d / traits::ScalarPerVector>{}) = bit_cast<vector_t>(vec_value);
+#endif
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+
+        return dst_tensor;
+    }
+
     template <typename DstTile,
               index_t i_access           = -1,
               bool oob_conditional_check = true,
@@ -849,6 +922,58 @@ struct tile_window_linear
         WINDOW_DISPATCH_ISSUE();
     }
 
+    template <index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
+    CK_TILE_DEVICE void update_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                   number<i_access>                     = {},
+                                   bool_constant<oob_conditional_check> = {},
+                                   bool_constant<pre_nop>               = {}) const
+    {
+
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+
+            // read from distributed tensor
+            vector_t vec_value;
+
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+
+            // write into bottom tensor
+            get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
+                bottom_tensor_thread_coord,
+                linear_offset,
+                bottom_tensor_flag,
+                vec_value,
+                bool_constant<oob_conditional_check>{},
+                bool_constant<pre_nop>{});
+        };
+
+        WINDOW_DISPATCH_ISSUE();
+    }
+
     // move thread's botom tensor coordiante
     // [x0', x1', ... ] ==> [offset]
     // also move window-origin
diff --git a/include/ck_tile/core/tensor/tile_window_utils.hpp b/include/ck_tile/core/tensor/tile_window_utils.hpp
new file mode 100644
index 000000000..71a72329f
--- /dev/null
+++ b/include/ck_tile/core/tensor/tile_window_utils.hpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/arch/utility.hpp"
+#include "ck_tile/core/algorithm/space_filling_curve.hpp"
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+#pragma once
+namespace ck_tile {
+
+// input a lds store tile, extract some information from it
+// used to set m0 value for gfx9 serious
+template <typename LdsTileWindow_>
+CK_TILE_DEVICE auto get_async_store_smem_info(LdsTileWindow_&& lds_tile)
+{
+    using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+    using LdsDataType   = typename LdsTileWindow::DataType;
+
+    // issues * warps * lanes
+    static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+
+    const index_t size_per_buf =
+        lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+            make_tuple(number<0>{}, number<0>{}, number<0>{})) *
+        sizeof(LdsDataType);
+
+    const index_t size_per_wave =
+        lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+            make_tuple(number<0>{}, number<1>{}, number<0>{})) *
+            sizeof(LdsDataType) -
+        size_per_buf;
+
+    const index_t size_per_issue =
+        lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+            make_tuple(number<1>{}, number<0>{}, number<0>{})) *
+            sizeof(LdsDataType) -
+        size_per_buf;
+
+    const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
+
+    return make_tuple(m0_init_value, size_per_issue);
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/update_tile.hpp b/include/ck_tile/core/tensor/update_tile.hpp
index fbce7c408..570abde18 100644
--- a/include/ck_tile/core/tensor/update_tile.hpp
+++ b/include/ck_tile/core/tensor/update_tile.hpp
@@ -41,15 +41,65 @@ template <typename BottomTensorView_,
           typename WindowLengths_,
           typename TileDistribution_,
           index_t NumCoord,
-          typename DataType_>
+          typename DataType_,
+          index_t i_access           = -1,
+          bool oob_conditional_check = true>
 CK_TILE_DEVICE void
 update_tile(tile_window_with_static_distribution<BottomTensorView_,
                                                  WindowLengths_,
                                                  TileDistribution_,
                                                  NumCoord>& tile_window,
-            const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+            const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor,
+            number<i_access>                     = {},
+            bool_constant<oob_conditional_check> = {})
 {
-    tile_window.update(dstr_tensor);
+    tile_window.update(dstr_tensor, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
+
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          index_t NumCoord,
+          typename DataType_,
+          index_t i_access           = -1,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE void
+update_tile_raw(tile_window_with_static_distribution<BottomTensorView_,
+                                                     WindowLengths_,
+                                                     TileDistribution_,
+                                                     NumCoord>& tile_window,
+                const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor,
+                number<i_access>                     = {},
+                bool_constant<oob_conditional_check> = {},
+                bool_constant<pre_nop>               = {})
+{
+    tile_window.update_raw(dstr_tensor,
+                           number<i_access>{},
+                           bool_constant<oob_conditional_check>{},
+                           bool_constant<pre_nop>{});
+}
+
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          typename DataType_,
+          index_t i_access           = -1,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE auto update_tile_raw(
+    tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
+        tile_window,
+    const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor,
+    number<i_access>                     = {},
+    bool_constant<oob_conditional_check> = {},
+    bool_constant<pre_nop>               = {})
+{
+    tile_window.update_raw(dstr_tensor,
+                           number<i_access>{},
+                           bool_constant<oob_conditional_check>{},
+                           bool_constant<pre_nop>{});
 }
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/utility/static_counter.hpp b/include/ck_tile/core/utility/static_counter.hpp
new file mode 100644
index 000000000..84af3dd52
--- /dev/null
+++ b/include/ck_tile/core/utility/static_counter.hpp
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+namespace ck_tile {
+
+template <typename Context, index_t Start = 0, index_t Step = 1>
+struct static_counter
+{
+    public:
+    template <typename Unique>
+    static constexpr index_t next()
+    {
+        return next<Unique>(0) * Step + Start;
+    }
+
+    template <unsigned long long>
+    static constexpr index_t next()
+    {
+        struct Unique
+        {
+        };
+        return next<Unique>(0) * Step + Start;
+    }
+
+    template <typename Unique>
+    static constexpr index_t current()
+    {
+        return current<Unique>(0) * Step + Start;
+    }
+
+    template <unsigned long long>
+    static constexpr index_t current()
+    {
+        struct Unique
+        {
+        };
+        return current<Unique>(0) * Step + Start;
+    }
+
+    private:
+    template <index_t I>
+    struct slot
+    {
+        _Pragma("GCC diagnostic push");
+        _Pragma("GCC diagnostic ignored \"-Wundefined-internal\"");
+        friend constexpr bool slot_allocated(slot<I>);
+        _Pragma("GCC diagnostic pop");
+    };
+
+    template <index_t I>
+    struct allocate_slot
+    {
+        friend constexpr bool slot_allocated(slot<I>) { return true; }
+        enum
+        {
+            value = I
+        };
+    };
+
+    // If slot_allocated(slot<I>) has NOT been defined, then SFINAE will keep this function out of
+    // the overload set...
+    template <typename Unique, index_t I = 0, bool = slot_allocated(slot<I>())>
+    static constexpr index_t next(index_t)
+    {
+        return next<Unique, I + 1>(0);
+    }
+
+    // ...And this function will be used, instead, which will define slot_allocated(slot<I>) via
+    // allocate_slot<I>.
+    template <typename Unique, index_t I = 0>
+    static constexpr index_t next(double)
+    {
+        return allocate_slot<I>::value;
+    }
+
+    // If slot_allocated(slot<I>) has NOT been defined, then SFINAE will keep this function out of
+    // the overload set...
+    template <typename Unique, index_t I = Start, bool = slot_allocated(slot<I>())>
+    static constexpr index_t current(index_t)
+    {
+        return current<Unique, I + 1>(0);
+    }
+
+    // ...And this function will be used, instead, which will return the current counter, or assert
+    // in case next() hasn't been called yet.
+    template <typename Unique, index_t I = Start>
+    static constexpr index_t current(double)
+    {
+        static_assert(I != 0, "You must invoke next() first");
+
+        return I - 1;
+    }
+};
+
+namespace impl {
+template <int I>
+struct static_counter_uniq_;
+}
+
+#define MAKE_SC() \
+    ck_tile::static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>> {}
+#define MAKE_SC_WITH(start_, step_) \
+    ck_tile::static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>, start_, step_> {}
+#define NEXT_SC(c_) c_.next<__COUNTER__>()
+#define NEXT_SCI(c_, static_i_) c_.next<__COUNTER__ + static_i_>()
+
+// Usage:
+// constexpr auto c = MAKE_SC()
+// NEXT_SC(c)    // -> constexpr 0
+// NEXT_SC(c)    // -> constexpr 1
+// NEXT_SC(c)    // -> constexpr 2
+} // namespace ck_tile
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index 2e96009ac..2f3a302ee 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -11,6 +11,7 @@
 #include "ck_tile/host/fill.hpp"
 #include "ck_tile/host/hip_check_error.hpp"
 #include "ck_tile/host/host_tensor.hpp"
+#include "ck_tile/host/joinable_thread.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/host/ranges.hpp"
 #include "ck_tile/host/reference/reference_batched_dropout.hpp"
@@ -20,6 +21,7 @@
 #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
 #include "ck_tile/host/reference/reference_batched_softmax.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
+#include "ck_tile/host/reference/reference_fused_moe.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
diff --git a/include/ck_tile/host/device_memory.hpp b/include/ck_tile/host/device_memory.hpp
index 7c8549f74..13684c0e2 100644
--- a/include/ck_tile/host/device_memory.hpp
+++ b/include/ck_tile/host/device_memory.hpp
@@ -7,6 +7,7 @@
 #include <stdint.h>
 #include <stdexcept>
 #include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/host/host_tensor.hpp"
 
 namespace ck_tile {
 template <typename T>
@@ -36,6 +37,19 @@ struct DeviceMem
             mpDeviceBuf = nullptr;
         }
     }
+    template <typename T>
+    DeviceMem(const HostTensor<T>& t) : mMemSize(t.get_element_space_size_in_bytes())
+    {
+        if(mMemSize != 0)
+        {
+            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+        }
+        else
+        {
+            mpDeviceBuf = nullptr;
+        }
+        ToDevice(t.data());
+    }
     void Realloc(std::size_t mem_size)
     {
         if(mpDeviceBuf)
@@ -92,6 +106,27 @@ struct DeviceMem
             HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
         }
     }
+
+    // construct a host tensor with type T
+    template <typename T>
+    HostTensor<T> ToHost(std::size_t cpySize)
+    {
+        // TODO: host tensor could be slightly larger than the device tensor
+        // we just copy all data from GPU buffer
+        std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T);
+        HostTensor<T> h_({host_elements});
+        if(mpDeviceBuf)
+        {
+            HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
+        }
+        return h_;
+    }
+    template <typename T>
+    HostTensor<T> ToHost()
+    {
+        return ToHost<T>(mMemSize);
+    }
+
     void SetZero() const
     {
         if(mpDeviceBuf)
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index 335911860..f24c33875 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -13,6 +13,7 @@
 #include <unordered_set>
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/host/joinable_thread.hpp"
 
 namespace ck_tile {
 
@@ -22,13 +23,44 @@ struct FillUniformDistribution
     float a_{-5.f};
     float b_{5.f};
     std::optional<uint32_t> seed_{11939};
+    // ATTENTION: threaded does not guarantee the distribution between thread
+    bool threaded = false;
 
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
-        std::uniform_real_distribution<float> dis(a_, b_);
-        std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
+        if(threaded)
+        {
+            uint32_t num_thread  = std::thread::hardware_concurrency();
+            auto total           = static_cast<std::size_t>(std::distance(first, last));
+            auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
+
+            std::vector<joinable_thread> threads(num_thread);
+            for(std::size_t it = 0; it < num_thread; ++it)
+            {
+                std::size_t iw_begin = it * work_per_thread;
+                std::size_t iw_end   = std::min((it + 1) * work_per_thread, total);
+                auto thread_f        = [this, total, iw_begin, iw_end, &first] {
+                    if(iw_begin > total || iw_end > total)
+                        return;
+                    // need to make each thread unique, add an offset to current seed
+                    std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
+                                                              : std::random_device{}());
+                    std::uniform_real_distribution<float> dis(a_, b_);
+                    std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
+                        return ck_tile::type_convert<T>(dis(gen));
+                    });
+                };
+                threads[it] = joinable_thread(thread_f);
+            }
+        }
+        else
+        {
+            std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
+            std::uniform_real_distribution<float> dis(a_, b_);
+            std::generate(
+                first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
+        }
     }
 
     template <typename ForwardRange>
@@ -115,13 +147,44 @@ struct FillNormalDistribution
     float mean_{0.f};
     float variance_{1.f};
     std::optional<uint32_t> seed_{11939};
+    // ATTENTION: threaded does not guarantee the distribution between thread
+    bool threaded = false;
 
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
-        std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
-        std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
+        if(threaded)
+        {
+            uint32_t num_thread  = std::thread::hardware_concurrency();
+            auto total           = static_cast<std::size_t>(std::distance(first, last));
+            auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
+
+            std::vector<joinable_thread> threads(num_thread);
+            for(std::size_t it = 0; it < num_thread; ++it)
+            {
+                std::size_t iw_begin = it * work_per_thread;
+                std::size_t iw_end   = std::min((it + 1) * work_per_thread, total);
+                auto thread_f        = [this, total, iw_begin, iw_end, &first] {
+                    if(iw_begin > total || iw_end > total)
+                        return;
+                    // need to make each thread unique, add an offset to current seed
+                    std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
+                                                              : std::random_device{}());
+                    std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
+                    std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
+                        return ck_tile::type_convert<T>(dis(gen));
+                    });
+                };
+                threads[it] = joinable_thread(thread_f);
+            }
+        }
+        else
+        {
+            std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
+            std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
+            std::generate(
+                first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
+        }
     }
 
     template <typename ForwardRange>
@@ -235,6 +298,44 @@ struct FillMonotonicSeq
     }
 };
 
+template <typename T, bool IsAscending = true>
+struct FillStepRange
+{
+    float start_value_{0};
+    float end_value_{3};
+    float step_{1};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::generate(first, last, [=, n = start_value_]() mutable {
+            auto tmp = n;
+            n += step_;
+            if constexpr(IsAscending)
+            {
+                if(n > end_value_)
+                    n = start_value_;
+            }
+            else
+            {
+                if(n < end_value_)
+                    n = start_value_;
+            }
+
+            return type_convert<T>(tmp);
+        });
+    }
+
+    template <typename ForwardRange>
+    auto operator()(ForwardRange&& range) const -> std::void_t<
+        decltype(std::declval<const FillStepRange&>()(std::begin(std::forward<ForwardRange>(range)),
+                                                      std::end(std::forward<ForwardRange>(range))))>
+    {
+        (*this)(std::begin(std::forward<ForwardRange>(range)),
+                std::end(std::forward<ForwardRange>(range)));
+    }
+};
+
 template <typename T>
 struct FillConstant
 {
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index 5610ba324..3902cad17 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -8,12 +8,13 @@
 #include <iostream>
 #include <iomanip>
 #include <numeric>
-#include <thread>
 #include <utility>
 #include <vector>
 #include <functional>
+#include <fstream>
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/host/joinable_thread.hpp"
 #include "ck_tile/host/ranges.hpp"
 
 namespace ck_tile {
@@ -213,23 +214,6 @@ CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old
     return HostTensorDescriptor(new_lengths, new_strides);
 }
 
-struct joinable_thread : std::thread
-{
-    template <typename... Xs>
-    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
-    {
-    }
-
-    joinable_thread(joinable_thread&&) = default;
-    joinable_thread& operator=(joinable_thread&&) = default;
-
-    ~joinable_thread()
-    {
-        if(this->joinable())
-            this->join();
-    }
-};
-
 template <typename F, typename... Xs>
 struct ParallelTensorFunctor
 {
@@ -590,6 +574,107 @@ struct HostTensor
                                       size() * FromSize / ToSize};
     }
 
+    friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
+    {
+        os << t.mDesc;
+        os << "[";
+        for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
+        {
+            if(0 < idx)
+            {
+                os << ", ";
+            }
+            if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t>)
+            {
+                os << type_convert<float>(t.mData[idx]) << " #### ";
+            }
+            else
+            {
+                os << t.mData[idx];
+            }
+        }
+        os << "]";
+        return os;
+    }
+
+    // read data from a file, as dtype
+    // the file could dumped from torch as (targeting tensor is t here)
+    // numpy.savetxt("f.txt", t.view(-1).numpy())
+    // numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
+    // numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d")   # save as int
+    // will output f.txt, each line is a value
+    // dtype=float or int, internally will cast to real type
+    void loadtxt(std::string file_name, std::string dtype = "float")
+    {
+        std::ifstream file(file_name);
+
+        if(file.is_open())
+        {
+            std::string line;
+
+            index_t cnt = 0;
+            while(std::getline(file, line))
+            {
+                if(cnt >= static_cast<index_t>(mData.size()))
+                {
+                    throw std::runtime_error(std::string("data read from file:") + file_name +
+                                             " is too big");
+                }
+
+                if(dtype == "float")
+                {
+                    mData[cnt] = type_convert<T>(std::stof(line));
+                }
+                else if(dtype == "int" || dtype == "int32")
+                {
+                    mData[cnt] = type_convert<T>(std::stoi(line));
+                }
+                cnt++;
+            }
+            file.close();
+            if(cnt < static_cast<index_t>(mData.size()))
+            {
+                std::cerr << "Warning! reading from file:" << file_name
+                          << ", does not match the size of this tensor" << std::endl;
+            }
+        }
+        else
+        {
+            // Print an error message to the standard error
+            // stream if the file cannot be opened.
+            throw std::runtime_error(std::string("unable to open file:") + file_name);
+        }
+    }
+
+    // can save to a txt file and read from torch as:
+    // torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
+    void savetxt(std::string file_name, std::string dtype = "float")
+    {
+        std::ofstream file(file_name);
+
+        if(file.is_open())
+        {
+            for(auto& itm : mData)
+            {
+                if(dtype == "float")
+                    file << type_convert<float>(itm) << std::endl;
+                else if(dtype == "int")
+                    file << type_convert<int>(itm) << std::endl;
+                else
+                    // TODO: we didn't implement operator<< for all custom
+                    // data types, here fall back to float in case compile error
+                    file << type_convert<float>(itm) << std::endl;
+            }
+            file.close();
+        }
+        else
+        {
+            // Print an error message to the standard error
+            // stream if the file cannot be opened.
+            throw std::runtime_error(std::string("unable to open file:") + file_name);
+        }
+    }
+
     Descriptor mDesc;
     Data mData;
 };
diff --git a/include/ck_tile/host/joinable_thread.hpp b/include/ck_tile/host/joinable_thread.hpp
new file mode 100644
index 000000000..a822f967d
--- /dev/null
+++ b/include/ck_tile/host/joinable_thread.hpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <thread>
+#include <utility>
+
+namespace ck_tile {
+
+struct joinable_thread : std::thread
+{
+    template <typename... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
+    {
+    }
+
+    joinable_thread(joinable_thread&&) = default;
+    joinable_thread& operator=(joinable_thread&&) = default;
+
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_fused_moe.hpp b/include/ck_tile/host/reference/reference_fused_moe.hpp
new file mode 100644
index 000000000..bf89f9275
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_fused_moe.hpp
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+namespace ck_tile {
+// [indexing implementation-1]
+// using M_a as constexpr block_size to partition all tokens into different slices
+// each slice map to one expert, and one expert can have multiple slices
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
+//           number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+//
+// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
+// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
+// * this could be larger than actual, since actual tokens are on GPU
+//
+// sorted_token_ids_ptr   : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6,
+// 0, 1, 2, 5]
+//                          |-  exp-0  -|-  exp-1  -|-  exp-2  -|-      exp-3          -|-  exp-4
+//                          -|-  exp-5  -|
+// sorted_weight_ptr      : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *,
+// c, f, i, o]
+//
+// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
+//
+// sorted_expert_ids_ptr  : [0, 1, 2, 3, 3, 4, 5]
+// * length is (max_num_tokens_padded + block_size - 1) / block_size
+///
+// num_tokens_post_padded_ptr : [28]
+// num_sorted_tiles_ptr : [7]
+
+template <typename AccDataType, // you only need to explcitly set this one
+          typename Activation,  // ck_tile::element_wise::Gelu
+          typename ADataType,
+          typename GDataType,
+          typename DDataType,
+          typename ODataType,
+          typename AScaleDataType,
+          typename GScaleDataType,
+          typename DScaleDataType,
+          typename YSmoothScaleDataType,
+          typename TopkWeightDataType,
+          typename IndexDataType>
+void reference_fused_moe(
+    const ck_tile::HostTensor<ADataType>& a_host,       // [tokens, hidden_size]
+    const ck_tile::HostTensor<GDataType>& g_host,       // [experts, interme_size_0, hidden_size]
+    const ck_tile::HostTensor<DDataType>& d_host,       // [experts, hidden_size, interme_size_1]
+    const ck_tile::HostTensor<AScaleDataType>& sa_host, // [tokens, 1],
+    const ck_tile::HostTensor<GScaleDataType>& sg_host, // [experts, 1, interme_size_0]
+    const ck_tile::HostTensor<DScaleDataType>& sd_host, // [experts, 1, hidden_size],
+    const ck_tile::HostTensor<YSmoothScaleDataType>& sy_host,        // [experts, 1, interme_size_0]
+    ck_tile::HostTensor<ODataType>& o_host,                          // [tokens, hidden_size]
+    const ck_tile::HostTensor<IndexDataType>& sorted_token_ids_host, // [max_num_tokens_padded]
+    const ck_tile::HostTensor<TopkWeightDataType>& sorted_weight_host, // [max_num_tokens_padded]
+    const ck_tile::HostTensor<IndexDataType>&
+        sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size]
+    const ck_tile::HostTensor<IndexDataType>& num_sorted_tiles_host, // [1]
+
+    const ck_tile::HostTensor<IndexDataType>&
+        token_ids_host, // [tokens, topk] --> ugly!!! remove in the future
+
+    ck_tile::index_t block_m,
+    ck_tile::index_t tokens,
+    ck_tile::index_t experts,
+    ck_tile::index_t hidden_size,
+    ck_tile::index_t intermediate_size, // this size is for gate/up
+    ck_tile::index_t topk,
+    ck_tile::index_t gate_only)
+{
+    assert(sorted_token_ids_host.get_num_of_dimension() == 1);
+    assert(sorted_weight_host.get_num_of_dimension() == 1);
+    assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
+    assert(num_sorted_tiles_host.get_element_size() == 1);
+    ck_tile::index_t num_sorted_tiles    = num_sorted_tiles_host.mData[0] / block_m;
+    ck_tile::index_t intermediate_size_0 = intermediate_size;
+    ck_tile::index_t intermediate_size_1 = intermediate_size / (gate_only ? 1 : 2);
+
+    // TODO: better remove this in the future, or modify the token_id value
+    auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
+        for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
+        {
+            if(token_ids_host(token_id_, i_) == expert_id_)
+                return i_;
+        }
+        throw std::runtime_error("not correct token/expert pair\n");
+        return -1; // TODO: not correct!!
+    };
+
+    ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
+
+    int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
+    // assert();
+    auto f = [&](auto i_flatten) {
+        ck_tile::index_t i_tile = i_flatten / block_m;
+        if(i_tile >= num_sorted_tiles)
+            return;
+        ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
+        ck_tile::index_t i_token  = sorted_token_ids_host.mData[i_flatten];
+        if(i_token >= tokens)
+            return;
+        ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
+        auto weight             = sorted_weight_host.mData[i_flatten];
+
+        ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
+        // first gemm
+        for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
+        {
+            AccDataType acc = static_cast<AccDataType>(0);
+            for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
+            {
+                acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
+                       type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
+            }
+            acc_0(0, i_n) = acc;
+            // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
+        }
+
+        ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
+        if(gate_only)
+        {
+            if(intermediate_size_1 != intermediate_size_0)
+                throw std::runtime_error(
+                    "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
+                    ", 1:" + std::to_string(intermediate_size_1));
+            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
+            {
+                Activation{}(y(0, i_n), acc_0(0, i_n));
+                // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
+            }
+        }
+        else
+        {
+            if(intermediate_size_1 * 2 != intermediate_size_0)
+                throw std::runtime_error(
+                    "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
+                    ", 1:" + std::to_string(intermediate_size_1));
+            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
+            {
+                AccDataType tmp;
+                Activation{}(tmp, acc_0(0, i_n));
+                y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul
+            }
+        }
+
+        // second gemm, loop along gemm-n
+        ck_tile::HostTensor<AccDataType> acc_1({1, hidden_size});
+        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
+        {
+            AccDataType acc = static_cast<AccDataType>(0);
+            for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
+            {
+                acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
+            }
+            acc_1(0, i_n) = acc * weight; // multiple weight here
+        }
+
+        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
+        {
+            out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n);
+        }
+    };
+
+    // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);
+
+    // reduce
+    auto r = [&](auto i_token) {
+        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
+        {
+            AccDataType acc = type_convert<AccDataType>(0);
+            for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
+            {
+                acc += out_topk_tokens(i_token, i_topk, i_n);
+            }
+            o_host(i_token, i_n) = type_convert<ODataType>(acc);
+        }
+    };
+    make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency());
+
+    (void)num_sorted_tiles_host;
+    (void)sa_host;
+    (void)sg_host;
+    (void)sd_host;
+    (void)sy_host;
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_permute.hpp b/include/ck_tile/host/reference/reference_permute.hpp
index 14ed4f815..4e0f1a877 100644
--- a/include/ck_tile/host/reference/reference_permute.hpp
+++ b/include/ck_tile/host/reference/reference_permute.hpp
@@ -16,7 +16,7 @@ namespace ck_tile {
 */
 template <typename DataType>
 CK_TILE_HOST void
-reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> dims)
+reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> perm)
 {
     const auto x_len = x.mDesc.get_lengths();
     const auto y_len = y.mDesc.get_lengths();
@@ -43,7 +43,7 @@ reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::v
             std::vector<size_t> tmp(rank, 0);
             for(index_t i = 0; i < rank; i++)
             {
-                tmp[dims[i]] = y_coord[i];
+                tmp[perm[i]] = y_coord[i];
             }
             return tmp;
         }();
@@ -54,4 +54,23 @@ reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::v
 
     make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
 }
+
+template <typename DataType>
+CK_TILE_HOST auto reference_permute(const HostTensor<DataType>& x, std::vector<index_t> perm)
+{
+    auto x_shape                          = x.get_lengths();
+    ck_tile::index_t rank                 = perm.size();
+    std::vector<ck_tile::index_t> y_shape = [&]() {
+        std::vector<ck_tile::index_t> tmp(rank, 0);
+        for(int i = 0; i < static_cast<int>(rank); i++)
+        {
+            tmp[i] = x_shape[perm[i]];
+        }
+        return tmp;
+    }();
+
+    HostTensor<DataType> y(y_shape);
+    reference_permute(x, y, perm);
+    return y;
+}
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 01217e16c..e24b1ba76 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -572,6 +572,105 @@ struct FastGelu
     }
 };
 
+struct FastGeluAsm
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    CK_TILE_DEVICE void operator()(Y& y, const X& x) const;
+
+    template <>
+    CK_TILE_HOST void operator()<float, float>(float& y, const float& x) const
+    {
+        // const float u   = -2.f * x * (0.035677f * x * x + 0.797885f);
+        const float c1  = -2.0 * 0.035677f;
+        const float c2  = -2.0 * 0.797885f;
+        const float u   = x * (c1 * x * x + c2);
+        const float emu = exp(u);
+        y               = x / (1.f + emu);
+    }
+
+    // device code, use lower precision "__ocml_exp_f32" and "rcp"
+    template <>
+    CK_TILE_DEVICE void operator()<float, float>(float& y, const float& x) const
+    {
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        const float c2        = -2.0 * 0.797885f;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        float tmp;
+
+        asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x]        ; x*x\n"
+                     "v_fma_f32 %[v_tmp], %[v_tmp], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+                     "v_mul_f32 %[v_tmp], %[v_tmp], %[v_x]      ; x*(c1*x*x+c2)\n"
+                     "v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+                     "v_exp_f32 %[v_tmp], %[v_tmp]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+                     "s_nop 0                                   ; hazard for exp\n"
+                     "v_add_f32 %[v_tmp], %[v_tmp], 1.0         ; emu+1.0f\n"
+                     "v_rcp_f32 %[v_tmp], %[v_tmp]              ; 1/(emu+1.0f)\n"
+                     "s_nop 0                                   ; hazard for rcp \n"
+                     "v_mul_f32 %[v_y], %[v_tmp], %[v_x]        ; x * 1/(emu+1f)\n"
+                     : [v_y] "=v"(y), [v_tmp] "+v"(tmp)
+                     : [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
+                     :);
+    }
+
+    template <>
+    CK_TILE_HOST void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        const float c1   = -2.0 * 0.035677f;
+        const float c2   = -2.0 * 0.797885f;
+        const float u0   = x.x * (c1 * x.x * x.x + c2);
+        const float emu0 = exp(u0);
+        y.x              = x.x / (1.f + emu0);
+        const float u1   = x.y * (c1 * x.y * x.y + c2);
+        const float emu1 = exp(u1);
+        y.y              = x.y / (1.f + emu1);
+    }
+
+    // this is packed verion to remove data hazard for trans
+    template <>
+    CK_TILE_DEVICE void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
+    {
+        const uint32_t c1     = 0xbd92220c; // -2.0 * 0.035677f;
+        float c2              = -2.0 * 0.797885f;
+        const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
+        float tmp0, tmp1;
+        float y0 = x.x, y1 = x.y;
+
+        asm volatile(
+            "v_mul_f32 %[v_tmp0], %[v_y0], %[v_y0]        ; x*x\n"
+            "v_mul_f32 %[v_tmp1], %[v_y1], %[v_y1]        ; x*x\n"
+            "v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+            "v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2]  ; c1*x*x+c2\n"
+            "v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_y0]      ; x*(c1*x*x+c2)\n"
+            "v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_y1]      ; x*(c1*x*x+c2)\n"
+            "v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+            "v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e]  ; log2e*x*(c1*x*x+c2)\n"
+            "v_exp_f32 %[v_tmp0], %[v_tmp0]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+            "v_exp_f32 %[v_tmp1], %[v_tmp1]              ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
+            "v_add_f32 %[v_tmp0], %[v_tmp0], 1.0         ; emu+1.0f\n"
+            "v_add_f32 %[v_tmp1], %[v_tmp1], 1.0         ; emu+1.0f\n"
+            "v_rcp_f32 %[v_tmp0], %[v_tmp0]              ; 1/(emu+1.0f)\n"
+            "v_rcp_f32 %[v_tmp1], %[v_tmp1]              ; 1/(emu+1.0f)\n"
+            "v_mul_f32 %[v_y0], %[v_tmp0], %[v_y0]        ; x * 1/(emu+1f)\n"
+            "v_mul_f32 %[v_y1], %[v_tmp1], %[v_y1]        ; x * 1/(emu+1f)\n"
+            : [v_y0] "+v"(y0),
+              [v_y1] "+v"(y1),
+              [v_c2] "+v"(c2),
+              // NOTE! it is totally possible that c2/y0/y1 share same register, they are all local
+              // tmp variables we need to expicitly hint compiler they may read+write, to allow
+              // allocate different register , the side effect is c2=** may issue for every such
+              // inline asm block
+              [v_tmp0] "+v"(tmp0),
+              [v_tmp1] "+v"(tmp1)
+            : [s_c1] "s"(c1), [s_log2e] "s"(log2e_)
+            :);
+        y.x = y0;
+        y.y = y1;
+    }
+};
+
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+erf(x/sqrt(2)))
 struct Gelu
diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp
new file mode 100644
index 000000000..eee80cda4
--- /dev/null
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
new file mode 100644
index 000000000..f5c7caf7d
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
+
+namespace ck_tile {
+
+// A async load to LDS, B direct to AGPR
+// B matrix preshuffled in br*kr*w
+// require 4 wave, occupancy=1c
+// agpr useage:256
+// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
+//
+// for this gemm, 4 16x16x16 transposed layout
+//  input A vpgpr layout
+//   v0-v15: [ 0:15](gemm_m)x128(gemm_k)
+//  v16-v31: [16:31](gemm_m)x128(gemm_k)
+
+//  input B vpgpr layout
+//   v0-v15: [  0: 15](gemm_n)x128(gemm_k)
+//  v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
+//  ......................
+//  v111-v127: [448:463](gemm_n)x128(gemm_k)
+
+//  output C vpgpr layout
+//   v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
+//   v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
+//   v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
+//  v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
+//  ......................
+//  v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
+//  v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
+struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16
+{
+    static constexpr index_t Block_M = 32;
+    static constexpr index_t Block_N = 512;
+    static constexpr index_t Block_K = 128;
+
+    static constexpr index_t WarpPerBlock_M = 1;
+    static constexpr index_t WarpPerBlock_N = 4;
+    static constexpr index_t WarpPerBlock_K = 1;
+
+    static constexpr index_t NumWarps = 4;
+
+    static constexpr index_t Warp_M = 16;
+    static constexpr index_t Warp_N = 16;
+    static constexpr index_t Warp_K = 32; // 16 * SubKPacks
+
+    static constexpr index_t BlockSize = 256;
+
+    static constexpr index_t SubKPacks = 2; // this is used to gurantee every threads can do dwordx4
+
+    // TODO: note Nr/Kr/W need consider SubKPacks
+    static constexpr index_t Block_W  = Warp_N * Warp_K;  // 512 element
+    static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 4 per wave
+    static constexpr index_t Block_Kr = Block_K / Warp_K; // 4
+
+    static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2
+    static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
+    static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<2, 1>, // !! note here is different
+            sequence<0, 0>>{};
+
+        using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution;
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    {
+        using CDataType             = float;
+        constexpr auto c_block_dstr = MakeCBlockDist();
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A()
+    {
+        // A async->LDS
+        // constexpr index_t Block_M = Problem::BlockShape::Block_M0;
+        // constexpr index_t Block_K = Problem::BlockShape::Block_K0;
+        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
+        constexpr index_t warpSize = ck_tile::get_warp_size();
+        // constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
+
+        constexpr index_t KPack_  = 8;      // GetSmemKPack_A<Problem>(); // LDS
+        constexpr index_t KVector = 2;      // GetAlignment_A<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = KPack_; // pad between warps
+
+        static_assert(Block_K % KVector == 0);
+        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
+        if constexpr(LanesPerK >= warpSize)
+        {
+            // need multiple waves to load K
+            static_assert(LanesPerK % warpSize == 0);
+            constexpr index_t wavesPerK = LanesPerK / warpSize;
+            if constexpr(wavesPerK > NumWarps)
+            {
+                // TODO: need multiple issues along K to load all data
+            }
+            else
+            {
+                constexpr index_t wavesPerM     = NumWarps / wavesPerK;
+                constexpr index_t NumIssues     = Block_M / wavesPerM;
+                constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<NumIssues>{},                             // m0
+                               number<wavesPerM>{},                             // m1
+                               number<wavesPerK>{},                             // k0
+                               number<warpSize>{},                              // k1
+                               number<KVector>{}),                              // k2
+                    make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{},  // m0
+                               number<wavesPerK*(warpSize * KVector + KPad)>{}, // m1
+                               number<warpSize * KVector + KPad>{},             // k0
+                               number<KVector>{},                               // k1
+                               number<1>{}),                                    // k2
+                    number<KVector>{}, // lds store vector(actually no explicit store)
+                    number<1>{});
+
+                constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
+                    lds_block_desc_0,
+                    make_tuple(
+                        make_pass_through_transform(number<NumIssues>{}),
+                        make_merge_transform(make_tuple(number<wavesPerM>{}, number<wavesPerK>{})),
+                        make_merge_transform(make_tuple(number<warpSize>{}, number<KVector>{}))),
+                    make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+                return lds_block_desc_issues_warps_lanes;
+            }
+        }
+        else
+        {
+            // lanes within a wave load different M but same K
+            static_assert(warpSize % LanesPerK == 0);
+            constexpr index_t LaneGroups = warpSize / LanesPerK; // along m
+            constexpr index_t NumIssues  = Block_M / (LaneGroups * NumWarps);
+
+            constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(number<NumIssues>{},                            // m0
+                           number<LaneGroups>{},                           // m1
+                           number<NumWarps>{},                             // m2
+                           number<LanesPerK>{},                            // k0
+                           number<KVector>{}),                             // k1
+                make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{}, // m0
+                           number<Block_K>{},                              // m1
+                           number<warpSize * KVector + KPad>{},            // m2
+                           number<KVector>{},                              // k0
+                           number<1>{}),                                   // k1
+                number<KVector>{}, // lds store vector(actually no explicit store)
+                number<1>{});
+
+            constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
+                lds_block_desc_0,
+                make_tuple(make_pass_through_transform(number<NumIssues>{}),
+                           make_pass_through_transform(number<NumWarps>{}),
+                           make_merge_transform(make_tuple(
+                               number<LaneGroups>{}, number<LanesPerK>{}, number<KVector>{}))),
+                make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+            return lds_block_desc_issues_warps_lanes;
+        }
+    }
+
+    // template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
+    {
+        // load from LDS to register, every wave has same layout
+        constexpr index_t KPack_ = 8;      // GetSmemKPack_A<Problem>(); // LDS
+        constexpr index_t KPad   = KPack_; // pad between warps
+
+        constexpr index_t kAMLane     = 16;
+        constexpr index_t kABKLane    = 4;
+        constexpr index_t kABKPerLane = 4;
+        constexpr index_t kKIter      = 2;
+        static_assert(KPack_ == (kABKPerLane * kKIter));
+
+        constexpr auto lds_block_desc_0 =
+            make_naive_tensor_descriptor(make_tuple(number<Repeat_M>{}, // m0 y
+                                                    number<kAMLane>{},  // m1 p
+                                                    number<Repeat_K>{}, // k0 y
+                                                    number<kABKLane>{}, // k1 p
+                                                    number<KPack_>{}),  // k2 y-vector
+                                         make_tuple(number<kAMLane*(Block_K + KPad)>{}, // m0
+                                                    number<Block_K + KPad>{},           // m1
+                                                    number<kABKLane * KPack_>{},        // k0
+                                                    number<KPack_>{},                   // k1
+                                                    number<1>{}),                       // k2
+                                         number<KPack_>{}, // lds load vector
+                                         number<1>{});
+
+        constexpr auto lds_desc_m_k = transform_tensor_descriptor(
+            lds_block_desc_0,
+            make_tuple(make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
+                       make_merge_transform(
+                           make_tuple(number<Repeat_K>{}, number<kABKLane>{}, number<KPack_>{}))),
+            make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return lds_desc_m_k;
+    }
+
+    static constexpr auto GetGemm_AWarpEnc()
+    {
+        constexpr index_t kAMLane     = 16;
+        constexpr index_t kABKLane    = 4;
+        constexpr index_t kABKPerLane = 4;
+        constexpr index_t kKIter      = 2;
+
+        using enc_ = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<kAMLane>, sequence<kABKLane, kABKPerLane * kKIter>>,
+            tuple<sequence<2, 1>>,
+            tuple<sequence<0, 0>>,
+            sequence<2>,
+            sequence<1>>;
+        return enc_{};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return 32 * (128 + 8) * sizeof(bf16_t);
+    }
+};
+
+struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base
+{
+    using ADataType = bf16_t;
+    using BDataType = bf16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    template <typename ARes, typename ACoords, typename BRes, typename BCoords>
+    CK_TILE_DEVICE auto
+    operator()(const ARes& res_a,
+               const ACoords& cached_coords_a,
+               const BRes& res_b,
+               const BCoords& cached_coords_b,
+               CK_TILE_LDS_ADDR void* smem,
+               index_t k,
+               index_t tile_offset_a, // for each tile, the offset to move for each unroll
+               index_t tile_offset_b) // for each tile, the offset to move for each unroll
+    {
+        static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
+        static_assert(BCoords::size() == Repeat_N);
+
+        auto a_sst = make_tile_window(
+            make_tensor_view<address_space_enum::lds>(
+                reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsStoreDesc_A()),
+            MakeLdsStoreDesc_A().get_lengths(),
+            {0, 0, 0});
+
+        auto a_sld = [&]() {
+            constexpr auto a_warp_enc_      = GetGemm_AWarpEnc();
+            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
+                sequence<WarpPerBlock_N>,
+                tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
+                tuple<sequence<1, 0>>,
+                tuple<sequence<1, 0>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto a_block_dstr_encode =
+                detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_);
+            return make_tile_window_linear(
+                make_tensor_view<address_space_enum::lds>(
+                    reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsLoadDesc_A()),
+                MakeLdsLoadDesc_A().get_lengths(),
+                {0, 0},
+                make_static_tile_distribution(a_block_dstr_encode));
+        }();
+
+        const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType);
+        const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType);
+
+        const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst);
+        constexpr auto smem_buf_size =
+            MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType);
+        static_assert(a_sld.get_num_of_access() == 8);
+        constexpr auto sld_os = generate_tuple(
+            [&](auto i_access) {
+                return number<a_sld.get_bottom_linear_offset(i_access) * sizeof(ADataType)>{};
+            },
+            number<a_sld.get_num_of_access()>{});
+
+        index_t loop_cnt = k / Block_K;
+
+        // this is the acc thread buffer
+        fp32x4_t v_acc[16]{.0f};
+
+        // B nr->kr
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        // clang-format off
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :   [s_loop_cnt]"+s"(loop_cnt),
+                [v_acc_0]"+v"(v_acc[0]),
+                [v_acc_1]"+v"(v_acc[1]),
+                [v_acc_2]"+v"(v_acc[2]),
+                [v_acc_3]"+v"(v_acc[3]),
+                [v_acc_4]"+v"(v_acc[4]),
+                [v_acc_5]"+v"(v_acc[5]),
+                [v_acc_6]"+v"(v_acc[6]),
+                [v_acc_7]"+v"(v_acc[7]),
+                [v_acc_8]"+v"(v_acc[8]),
+                [v_acc_9]"+v"(v_acc[9]),
+                [v_acc_10]"+v"(v_acc[10]),
+                [v_acc_11]"+v"(v_acc[11]),
+                [v_acc_12]"+v"(v_acc[12]),
+                [v_acc_13]"+v"(v_acc[13]),
+                [v_acc_14]"+v"(v_acc[14]),
+                [v_acc_15]"+v"(v_acc[15]),
+                [s_mem_]"+r"(smem)
+            : [s_res_a0]"s"(res_a[0]),
+                [s_res_a1]"s"(res_a[1]),
+                [s_res_a2]"s"(res_a[2]),
+                [s_res_a3]"s"(res_a[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
+                [v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
+                [v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
+                [v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))),
+                [v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))),
+                [v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))),
+                [v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))),
+                [v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))),
+
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
+                [s_m0_init]"s"(m0_init_value),
+                [s_size_per_issue]"s"(size_per_issue),
+                [smem_sz]"n"(smem_buf_size),  //(smem_buf_size),
+                [sld_os_0]"n"(sld_os[number<0>{}].value),
+                [sld_os_1]"n"(sld_os[number<1>{}].value),
+                [sld_os_2]"n"(sld_os[number<2>{}].value),
+                [sld_os_3]"n"(sld_os[number<3>{}].value),
+                [sld_os_4]"n"(sld_os[number<4>{}].value),
+                [sld_os_5]"n"(sld_os[number<5>{}].value),
+                [sld_os_6]"n"(sld_os[number<6>{}].value),
+                [sld_os_7]"n"(sld_os[number<7>{}].value),
+                [s_tile_os_a]"s"(tile_offset_a_bytes),
+                [s_tile_os_b]"s"(tile_offset_b_bytes)
+            : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+          "s86",    // s86 as tmp
+          "v64", "v65", "v66", "v67", "v68", "v69",
+          "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
+          "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
+          "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99",
+          "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
+          "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115",
+          "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123",
+          "v124", "v125", "v126", "v127"
+        );
+        // clang-format on
+#pragma clang diagnostic pop
+
+        // return local scratch
+        auto c = MakeCBlockTile();
+        for(auto i = 0; i < 16; i++)
+        {
+            c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
+            c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
+            c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
+            c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
+        }
+        return c;
+    }
+};
+
+struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base
+{
+    using ADataType = fp16_t;
+    using BDataType = fp16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    template <typename ARes, typename ACoords, typename BRes, typename BCoords>
+    CK_TILE_DEVICE auto
+    operator()(const ARes& res_a,
+               const ACoords& cached_coords_a,
+               const BRes& res_b,
+               const BCoords& cached_coords_b,
+               CK_TILE_LDS_ADDR void* smem,
+               index_t k,
+               index_t tile_offset_a, // for each tile, the offset to move for each unroll
+               index_t tile_offset_b) // for each tile, the offset to move for each unroll
+    {
+        static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
+        static_assert(BCoords::size() == Repeat_N);
+
+        auto a_sst = make_tile_window(
+            make_tensor_view<address_space_enum::lds>(
+                reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsStoreDesc_A()),
+            MakeLdsStoreDesc_A().get_lengths(),
+            {0, 0, 0});
+
+        auto a_sld = [&]() {
+            constexpr auto a_warp_enc_      = GetGemm_AWarpEnc();
+            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
+                sequence<WarpPerBlock_N>,
+                tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
+                tuple<sequence<1, 0>>,
+                tuple<sequence<1, 0>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto a_block_dstr_encode =
+                detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_);
+            return make_tile_window_linear(
+                make_tensor_view<address_space_enum::lds>(
+                    reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsLoadDesc_A()),
+                MakeLdsLoadDesc_A().get_lengths(),
+                {0, 0},
+                make_static_tile_distribution(a_block_dstr_encode));
+        }();
+
+        const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType);
+        const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType);
+
+        const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst);
+        constexpr auto smem_buf_size =
+            MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType);
+        static_assert(a_sld.get_num_of_access() == 8);
+        constexpr auto sld_os = generate_tuple(
+            [&](auto i_access) {
+                return number<a_sld.get_bottom_linear_offset(i_access) * sizeof(ADataType)>{};
+            },
+            number<a_sld.get_num_of_access()>{});
+
+        index_t loop_cnt = k / Block_K;
+
+        // this is the acc thread buffer
+        fp32x4_t v_acc[16]{.0f};
+
+        // B nr->kr
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        // clang-format off
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
+#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :   [s_loop_cnt]"+s"(loop_cnt),
+                [v_acc_0]"+v"(v_acc[0]),
+                [v_acc_1]"+v"(v_acc[1]),
+                [v_acc_2]"+v"(v_acc[2]),
+                [v_acc_3]"+v"(v_acc[3]),
+                [v_acc_4]"+v"(v_acc[4]),
+                [v_acc_5]"+v"(v_acc[5]),
+                [v_acc_6]"+v"(v_acc[6]),
+                [v_acc_7]"+v"(v_acc[7]),
+                [v_acc_8]"+v"(v_acc[8]),
+                [v_acc_9]"+v"(v_acc[9]),
+                [v_acc_10]"+v"(v_acc[10]),
+                [v_acc_11]"+v"(v_acc[11]),
+                [v_acc_12]"+v"(v_acc[12]),
+                [v_acc_13]"+v"(v_acc[13]),
+                [v_acc_14]"+v"(v_acc[14]),
+                [v_acc_15]"+v"(v_acc[15]),
+                [s_mem_]"+r"(smem)
+            : [s_res_a0]"s"(res_a[0]),
+                [s_res_a1]"s"(res_a[1]),
+                [s_res_a2]"s"(res_a[2]),
+                [s_res_a3]"s"(res_a[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
+                [v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
+                [v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
+                [v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))),
+                [v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))),
+                [v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))),
+                [v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))),
+                [v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))),
+
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
+                [s_m0_init]"s"(m0_init_value),
+                [s_size_per_issue]"s"(size_per_issue),
+                [smem_sz]"n"(smem_buf_size),  //(smem_buf_size),
+                [sld_os_0]"n"(sld_os[number<0>{}].value),
+                [sld_os_1]"n"(sld_os[number<1>{}].value),
+                [sld_os_2]"n"(sld_os[number<2>{}].value),
+                [sld_os_3]"n"(sld_os[number<3>{}].value),
+                [sld_os_4]"n"(sld_os[number<4>{}].value),
+                [sld_os_5]"n"(sld_os[number<5>{}].value),
+                [sld_os_6]"n"(sld_os[number<6>{}].value),
+                [sld_os_7]"n"(sld_os[number<7>{}].value),
+                [s_tile_os_a]"s"(tile_offset_a_bytes),
+                [s_tile_os_b]"s"(tile_offset_b_bytes)
+            : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+          "s86",    // s86 as tmp
+          "v64", "v65", "v66", "v67", "v68", "v69",
+          "v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
+          "v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
+          "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99",
+          "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
+          "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115",
+          "v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123",
+          "v124", "v125", "v126", "v127"
+        );
+        // clang-format on
+#pragma clang diagnostic pop
+
+        // return local scratch
+        auto c = MakeCBlockTile();
+        for(auto i = 0; i < 16; i++)
+        {
+            c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
+            c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
+            c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
+            c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
+        }
+        return c;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp
new file mode 100644
index 000000000..203c87b9c
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
+
+namespace ck_tile {
+
+// "S"tream update output along "N"
+// A in smem, B load from global
+// require 4 wave, occupancy=1c
+struct FlatmmSn_32x128x512_1x4x1_16x16x32_Base
+{
+    static constexpr index_t Block_M = 32;
+    static constexpr index_t Block_N = 128;
+    static constexpr index_t Block_K = 512;
+
+    static constexpr index_t WarpPerBlock_M = 1;
+    static constexpr index_t WarpPerBlock_N = 4;
+    static constexpr index_t WarpPerBlock_K = 1;
+
+    static constexpr index_t Warp_M = 16;
+    static constexpr index_t Warp_N = 16;
+    static constexpr index_t Warp_K = 32;
+
+    static constexpr index_t BlockSize = 256;
+
+    // static constexpr index_t KPack = 2; // this is used to gurantee every threads can do dwordx4
+
+    // TODO: note Nr/Kr/W need consider KPack
+    static constexpr index_t Block_W  = Warp_N * Warp_K;  // 512 element
+    static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 4 per wave
+    static constexpr index_t Block_Kr = Block_K / Warp_K; // 4
+
+    static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2
+    static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 2
+    static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 16
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<2, 1>, // !! note here is different
+            sequence<0, 0>>{};
+
+        using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution;
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        //                    y     y     p     p      p      y
+        // reg before shfl  M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
+        // but order is N0*M0*Nv
+        // in LDS we need store as
+        //          M0(2)* N0(2) *  Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
+        //             y    y       wave-id  lid/16  lid%16   v
+        return 2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t);
+    }
+};
+
+struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
+{
+    using BDataType = bf16_t;
+    using ODataType = bf16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
+    template <typename BRes,
+              typename BCoords,
+              typename ORes,
+              typename OCoords,
+              typename OFlags,
+              typename ScaleTensor>
+    CK_TILE_DEVICE auto
+    operator()(const BRes& res_b,
+               const BCoords& cached_coords_b,
+               const ORes& res_o,
+               const OCoords& cached_coords_o,
+               const OFlags& o_flags, // this should be in sgpr
+               CK_TILE_LDS_ADDR void* smem,
+               index_t n, // loop along n dim
+               const ScaleTensor& scale_,
+               index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
+               index_t tile_offset_o)
+    {
+        static_assert(BCoords::size() == 8); // 8
+        static_assert(OCoords::size() == 8);
+
+        const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
+        const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
+
+        static_assert(ScaleTensor::size() == 2);
+        float s0 = scale_[number<0>{}];
+        float s1 = scale_[number<1>{}];
+
+        index_t loop_cnt = n / Block_N;
+
+        register float v_c0 asm("v64");
+        register float v_c1 asm("v65");
+        register float v_c2 asm("v66");
+        register float v_c3 asm("v67");
+        register float v_c4 asm("v68");
+        register float v_c5 asm("v69");
+        register float v_c6 asm("v70");
+        register float v_c7 asm("v71");
+        register float v_c8 asm("v72");
+        register float v_c9 asm("v73");
+        register float v_c10 asm("v74");
+        register float v_c11 asm("v75");
+        register float v_c12 asm("v76");
+        register float v_c13 asm("v77");
+        register float v_c14 asm("v78");
+        register float v_c15 asm("v79");
+        register float v_c16 asm("v80");
+        register float v_c17 asm("v81");
+        register float v_c18 asm("v82");
+        register float v_c19 asm("v83");
+        register float v_c20 asm("v84");
+        register float v_c21 asm("v85");
+        register float v_c22 asm("v86");
+        register float v_c23 asm("v87");
+        register float v_c24 asm("v88");
+        register float v_c25 asm("v89");
+        register float v_c26 asm("v90");
+        register float v_c27 asm("v91");
+        register float v_c28 asm("v92");
+        register float v_c29 asm("v93");
+        register float v_c30 asm("v94");
+        register float v_c31 asm("v95");
+        int32_t nan_hi = 0x7fff0000;
+        int32_t nan_lo = 0x00007fff;
+
+        // in smem, the layout is  M0(2)*K0(128)*M1(16)*K1(4)
+        // every threads need 8xK in contiguous register
+        // ... and every wave need the same data
+        int lane_id  = threadIdx.x % 64;
+        int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
+        sld_y_os *= 2;
+
+        //                    y     y     p     p      p      y
+        // reg before shfl  M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
+        // but order is N0*M0*Nv
+        // in LDS we need store as
+        //          M0(2)* N0(2) *  Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
+        //             y    y       wave-id  lid/16  lid%16   v
+        // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
+        int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
+        sfl_sst *= 2;
+
+        // from LDS we need load as
+        //          M0(2)*    N0(2) *  Nl(4) * Nw(4) * (Mw(16)         *  Nv(4) + 4)
+        //        ( 2 issue)    (rem 32-lane)        (4 wave*4issue)   2lane*1ussue(pk2)
+        // sld(v4) = v0/2 *34*4  + v0 % 2 *4 + wid*2 *4
+        int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
+        sfl_sld *= 2;
+
+        // B nr->kr
+        // clang-format off
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :[smem_]"+r"(smem),
+            [s_loop_cnt]"+s"(loop_cnt),
+                [c0]"+v" (v_c0),
+                [c1]"+v" (v_c1),
+                [c2]"+v" (v_c2),
+                [c3]"+v" (v_c3),
+                [c4]"+v" (v_c4),
+                [c5]"+v" (v_c5),
+                [c6]"+v" (v_c6),
+                [c7]"+v" (v_c7),
+                [c8]"+v" (v_c8),
+                [c9]"+v" (v_c9),
+                [c10]"+v"(v_c10),
+                [c11]"+v"(v_c11),
+                [c12]"+v"(v_c12),
+                [c13]"+v"(v_c13),
+                [c14]"+v"(v_c14),
+                [c15]"+v"(v_c15),
+                [c16]"+v"(v_c16),
+                [c17]"+v"(v_c17),
+                [c18]"+v"(v_c18),
+                [c19]"+v"(v_c19),
+                [c20]"+v"(v_c20),
+                [c21]"+v"(v_c21),
+                [c22]"+v"(v_c22),
+                [c23]"+v"(v_c23),
+                [c24]"+v"(v_c24),
+                [c25]"+v"(v_c25),
+                [c26]"+v"(v_c26),
+                [c27]"+v"(v_c27),
+                [c28]"+v"(v_c28),
+                [c29]"+v"(v_c29),
+                [c30]"+v"(v_c30),
+                [c31]"+v"(v_c31)
+            :
+            [sld_a_base]"n"(0),
+            [shfl_base]"n"(0),
+            [v_sld_y_os]"v"(sld_y_os),
+            [v_sfl_sld]"v"(sfl_sld),
+            [v_sfl_sst]"v"(sfl_sst),
+            [s_res_o0]"s"(res_o[0]),
+                [s_res_o1]"s"(res_o[1]),
+                //[s_res_o2]"s"(res_o[2]),
+                //[s_res_o3]"s"(res_o[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
+                [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
+                [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
+                [v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
+                [v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
+                [v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
+                [v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
+                [v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [s_tile_os_o]"s"(tile_stride_o_bytes),
+                [s_tile_os_b]"s"(tile_stride_b_bytes),
+                [scale_0]"v"(s0),
+                [scale_1]"v"(s1),
+                [v_nan_lo]"v"(nan_lo),
+                [v_nan_hi]"v"(nan_hi),
+                [s_execflag_0]"s"(o_flags[number<0>{}]),
+                [s_execflag_1]"s"(o_flags[number<1>{}]),
+                [s_execflag_2]"s"(o_flags[number<2>{}]),
+                [s_execflag_3]"s"(o_flags[number<3>{}]),
+                [s_execflag_4]"s"(o_flags[number<4>{}]),
+                [s_execflag_5]"s"(o_flags[number<5>{}]),
+                [s_execflag_6]"s"(o_flags[number<6>{}]),
+                [s_execflag_7]"s"(o_flags[number<7>{}])
+            :
+          "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
+           "s36", "s37",
+          "v50", "v54", "v55",
+          "v64","v65","v66","v67","v68","v69","v70","v71",
+          "v72","v73","v74","v75","v76","v77","v78","v79",
+          "v80","v81","v82","v83","v84","v85","v86","v87",
+          "v88","v89","v90","v91","v92","v93","v94","v95",
+          "v128", "v129", "v130", "v131",
+          "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
+          "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
+          "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
+          "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
+          "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
+          "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+          "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
+          "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
+          "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
+          "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
+          "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
+          "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
+          "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
+          "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
+          "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+          "v252", "v253", "v254", "v255"
+        );
+#pragma clang diagnostic pop
+        // clang-format on
+    }
+};
+
+struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
+{
+    using BDataType = bf16_t;
+    using ODataType = bf16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
+    template <typename BRes,
+              typename BCoords,
+              typename ORes,
+              typename OCoords,
+              typename OFlags,
+              typename ScaleTensor>
+    CK_TILE_DEVICE auto
+    operator()(const BRes& res_b,
+               const BCoords& cached_coords_b,
+               const ORes& res_o,
+               const OCoords& cached_coords_o,
+               const OFlags& o_flags, // this should be in sgpr
+               CK_TILE_LDS_ADDR void* smem,
+               index_t n, // loop along n dim
+               const ScaleTensor& scale_,
+               index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
+               index_t tile_offset_o)
+    {
+        static_assert(BCoords::size() == 8); // 8
+        static_assert(OCoords::size() == 8);
+
+        const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
+        const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
+
+        static_assert(ScaleTensor::size() == 2);
+        float s0 = scale_[number<0>{}];
+        float s1 = scale_[number<1>{}];
+
+        index_t loop_cnt = n / Block_N;
+
+        register float v_c0 asm("v64");
+        register float v_c1 asm("v65");
+        register float v_c2 asm("v66");
+        register float v_c3 asm("v67");
+        register float v_c4 asm("v68");
+        register float v_c5 asm("v69");
+        register float v_c6 asm("v70");
+        register float v_c7 asm("v71");
+        register float v_c8 asm("v72");
+        register float v_c9 asm("v73");
+        register float v_c10 asm("v74");
+        register float v_c11 asm("v75");
+        register float v_c12 asm("v76");
+        register float v_c13 asm("v77");
+        register float v_c14 asm("v78");
+        register float v_c15 asm("v79");
+        register float v_c16 asm("v80");
+        register float v_c17 asm("v81");
+        register float v_c18 asm("v82");
+        register float v_c19 asm("v83");
+        register float v_c20 asm("v84");
+        register float v_c21 asm("v85");
+        register float v_c22 asm("v86");
+        register float v_c23 asm("v87");
+        register float v_c24 asm("v88");
+        register float v_c25 asm("v89");
+        register float v_c26 asm("v90");
+        register float v_c27 asm("v91");
+        register float v_c28 asm("v92");
+        register float v_c29 asm("v93");
+        register float v_c30 asm("v94");
+        register float v_c31 asm("v95");
+        int32_t nan_hi = 0x7fff0000;
+        int32_t nan_lo = 0x00007fff;
+
+        // in smem, the layout is  M0(2)*K0(128)*M1(16)*K1(4)
+        // every threads need 8xK in contiguous register
+        // ... and every wave need the same data
+        int lane_id  = threadIdx.x % 64;
+        int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
+        sld_y_os *= 2;
+
+        //                    y     y     p     p      p      y
+        // reg before shfl  M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
+        // but order is N0*M0*Nv
+        // in LDS we need store as
+        //          M0(2)* N0(2) *  Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
+        //             y    y       wave-id  lid/16  lid%16   v
+        // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
+        int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
+        sfl_sst *= 2;
+
+        // from LDS we need load as
+        //          M0(2)*    N0(2) *  Nl(4) * Nw(4) * (Mw(16)         *  Nv(4) + 4)
+        //        ( 2 issue)    (rem 32-lane)        (4 wave*4issue)   2lane*1ussue(pk2)
+        // sld(v4) = v0/2 *34*4  + v0 % 2 *4 + wid*2 *4
+        int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
+        sfl_sld *= 2;
+
+        // B nr->kr
+        // clang-format off
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
+#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :[smem_]"+r"(smem),
+            [s_loop_cnt]"+s"(loop_cnt),
+                [c0]"+v" (v_c0),
+                [c1]"+v" (v_c1),
+                [c2]"+v" (v_c2),
+                [c3]"+v" (v_c3),
+                [c4]"+v" (v_c4),
+                [c5]"+v" (v_c5),
+                [c6]"+v" (v_c6),
+                [c7]"+v" (v_c7),
+                [c8]"+v" (v_c8),
+                [c9]"+v" (v_c9),
+                [c10]"+v"(v_c10),
+                [c11]"+v"(v_c11),
+                [c12]"+v"(v_c12),
+                [c13]"+v"(v_c13),
+                [c14]"+v"(v_c14),
+                [c15]"+v"(v_c15),
+                [c16]"+v"(v_c16),
+                [c17]"+v"(v_c17),
+                [c18]"+v"(v_c18),
+                [c19]"+v"(v_c19),
+                [c20]"+v"(v_c20),
+                [c21]"+v"(v_c21),
+                [c22]"+v"(v_c22),
+                [c23]"+v"(v_c23),
+                [c24]"+v"(v_c24),
+                [c25]"+v"(v_c25),
+                [c26]"+v"(v_c26),
+                [c27]"+v"(v_c27),
+                [c28]"+v"(v_c28),
+                [c29]"+v"(v_c29),
+                [c30]"+v"(v_c30),
+                [c31]"+v"(v_c31)
+            :
+            [sld_a_base]"n"(0),
+            [shfl_base]"n"(0),
+            [v_sld_y_os]"v"(sld_y_os),
+            [v_sfl_sld]"v"(sfl_sld),
+            [v_sfl_sst]"v"(sfl_sst),
+            [s_res_o0]"s"(res_o[0]),
+                [s_res_o1]"s"(res_o[1]),
+                //[s_res_o2]"s"(res_o[2]),
+                //[s_res_o3]"s"(res_o[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
+                [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
+                [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
+                [v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
+                [v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
+                [v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
+                [v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
+                [v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [s_tile_os_o]"s"(tile_stride_o_bytes),
+                [s_tile_os_b]"s"(tile_stride_b_bytes),
+                [scale_0]"v"(s0),
+                [scale_1]"v"(s1),
+                [v_nan_lo]"v"(nan_lo),
+                [v_nan_hi]"v"(nan_hi),
+                [s_execflag_0]"s"(o_flags[number<0>{}]),
+                [s_execflag_1]"s"(o_flags[number<1>{}]),
+                [s_execflag_2]"s"(o_flags[number<2>{}]),
+                [s_execflag_3]"s"(o_flags[number<3>{}]),
+                [s_execflag_4]"s"(o_flags[number<4>{}]),
+                [s_execflag_5]"s"(o_flags[number<5>{}]),
+                [s_execflag_6]"s"(o_flags[number<6>{}]),
+                [s_execflag_7]"s"(o_flags[number<7>{}])
+            :
+          "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
+           "s36", "s37",
+          "v50", "v54", "v55",
+          "v64","v65","v66","v67","v68","v69","v70","v71",
+          "v72","v73","v74","v75","v76","v77","v78","v79",
+          "v80","v81","v82","v83","v84","v85","v86","v87",
+          "v88","v89","v90","v91","v92","v93","v94","v95",
+          "v128", "v129", "v130", "v131",
+          "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
+          "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
+          "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
+          "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
+          "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
+          "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+          "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
+          "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
+          "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
+          "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
+          "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
+          "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
+          "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
+          "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
+          "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+          "v252", "v253", "v254", "v255"
+        );
+#pragma clang diagnostic pop
+        // clang-format on
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp
new file mode 100644
index 000000000..003335c0e
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/flatmm_uk_config.hpp
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#define CK_TILE_FLATMM_UK_MFMA_FP16 0
+#define CK_TILE_FLATMM_UK_MFMA_BF16 1
+#define CK_TILE_FLATMM_UK_MFMA_INT8 2
+#define CK_TILE_FLATMM_UK_MFMA_FP8 3
+#define CK_TILE_FLATMM_UK_MFMA_BF8 4
diff --git a/include/ck_tile/ops/flatmm/block/uk/README.md b/include/ck_tile/ops/flatmm/block/uk/README.md
new file mode 100644
index 000000000..84fa13229
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/uk/README.md
@@ -0,0 +1 @@
+the files under this folder should not be included directly!
\ No newline at end of file
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
new file mode 100644
index 000000000..8b57611f0
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
@@ -0,0 +1,613 @@
+#ifndef CK_TILE_FLATMM_UK_MFMA
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#endif
+
+#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
+#   define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
+
+#   define _UK_PK_CVT_(x0_, x1_, y_)                           \
+    "  v_cmp_u_f32   s[36:37], " x0_ ", " x0_ "          \n"   \
+    "  v_add3_u32    v50, " x0_ ", %[v_nan_lo], 1        \n"   \
+    "  v_cndmask_b32  v54, v50, %[v_nan_hi], s[36:37]    \n"   \
+    "  v_cmp_u_f32   s[36:37], " x1_ ", " x1_ "          \n"   \
+    "  v_add3_u32    v50, " x1_ ", %[v_nan_lo], 1        \n"   \
+    "  v_cndmask_b32  v55, v50, %[v_nan_hi], s[36:37]    \n"   \
+    "  v_perm_b32    " y_ ", v55, v54, s52               \n"
+
+#   define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
+
+#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
+
+#   define _UK_PK_CVT_(x0_, x1_, y_)                \
+    "  v_cvt_f16_f32  v54, " x0_ "  \n"             \
+    "  v_cvt_f16_f32  v55, " x1_ "  \n"             \
+    "  v_pack_b32_f16 " y_ ", v54, v55  \n"
+
+#   define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
+
+#endif
+
+
+";-------------------------------------------------------------\n"
+" s_mov_b32 s52, 0x07060302 ; v_perm\n"
+" s_mov_b64 s[38:39], exec ; save current exec\n"
+" s_mov_b32 s8,    %[s_res_o0] \n"
+" s_mov_b32 s9,    %[s_res_o1] \n"
+" s_mov_b32 s12,    %[s_res_b0] \n"
+" s_mov_b32 s13,    %[s_res_b1] \n"
+" s_mov_b32 s14,    %[s_res_b2] \n"
+" s_mov_b32 s15,    %[s_res_b3] \n"
+" ds_read_b64   v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]                       \n"
+" ds_read_b64   v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]                     \n"
+" ds_read_b64   v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]                    \n"
+" ds_read_b64   v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]                    \n"
+" ds_read_b64   v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]                    \n"
+" ds_read_b64   v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]                    \n"
+" ds_read_b64   v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]                    \n"
+" ds_read_b64   v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]                    \n"
+" ds_read_b64   v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]                    \n"
+" ds_read_b64   v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]                    \n"
+" ds_read_b64   v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]                    \n"
+" ds_read_b64   v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]                    \n"
+" ds_read_b64   v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]                    \n"
+" ds_read_b64   v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]                    \n"
+" ds_read_b64   v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]                    \n"
+" ds_read_b64   v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]                    \n"
+" ds_read_b64   v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]                    \n"
+" ds_read_b64   v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]                    \n"
+" ds_read_b64   v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]                    \n"
+" ds_read_b64   v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]                    \n"
+" ds_read_b64   v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]                    \n"
+" ds_read_b64   v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]                    \n"
+" ds_read_b64   v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]                    \n"
+" ds_read_b64   v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]                    \n"
+" ds_read_b64   v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]                    \n"
+" ds_read_b64   v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]                    \n"
+" ds_read_b64   v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]                    \n"
+" ds_read_b64   v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]                    \n"
+" ds_read_b64   v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]                    \n"
+" ds_read_b64   v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]                    \n"
+" ds_read_b64   v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]                    \n"
+" ds_read_b64   v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]                    \n"
+" ds_read_b64   v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]                    \n"
+" ds_read_b64   v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]                    \n"
+" ds_read_b64   v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]                    \n"
+" ds_read_b64   v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]                    \n"
+" ds_read_b64   v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]                    \n"
+" ds_read_b64   v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]                    \n"
+" ds_read_b64   v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]                    \n"
+" ds_read_b64   v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]                    \n"
+" ds_read_b64   v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]                    \n"
+" ds_read_b64   v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]                    \n"
+" ds_read_b64   v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]                    \n"
+" ds_read_b64   v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]                    \n"
+" ds_read_b64   v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]                    \n"
+" ds_read_b64   v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]                    \n"
+" ds_read_b64   v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]                    \n"
+" ds_read_b64   v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]                    \n"
+" ds_read_b64   v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]                    \n"
+" ds_read_b64   v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]                    \n"
+" ds_read_b64   v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]                    \n"
+" ds_read_b64   v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]                    \n"
+" ds_read_b64   v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]                    \n"
+" ds_read_b64   v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]                    \n"
+" ds_read_b64   v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]                    \n"
+" ds_read_b64   v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]                    \n"
+" ds_read_b64   v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]                    \n"
+" ds_read_b64   v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]                    \n"
+" ds_read_b64   v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]                    \n"
+" ds_read_b64   v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]                    \n"
+" ds_read_b64   v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]                    \n"
+" ds_read_b64   v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]                    \n"
+" ds_read_b64   v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]                    \n"
+" ds_read_b64   v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]                    \n"
+"  s_waitcnt 0                    \n"
+"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+"  s_add_u32     s12, s86, s12                                  \n"
+"  s_addc_u32    s13, 0, s13                                    \n"
+"  s_waitcnt 0                    \n"
+"L_start%=:                    \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+"  s_barrier                                             \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
+"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
+"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
+"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
+"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
+"  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
+"  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
+"  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
+"  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
+_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n"
+"  v_mul_f32     %[c0], %[scale_0], %[c0]                            \n"
+"  v_mul_f32     %[c1], %[scale_0], %[c1]                            \n"
+"  v_mul_f32     %[c2], %[scale_0], %[c2]                            \n"
+"  v_mul_f32     %[c3], %[scale_0], %[c3]                            \n"
+"  v_mul_f32     %[c4], %[scale_1], %[c4]                            \n"
+"  v_mul_f32     %[c5], %[scale_1], %[c5]                            \n"
+"  v_mul_f32     %[c6], %[scale_1], %[c6]                            \n"
+"  v_mul_f32     %[c7], %[scale_1], %[c7]                            \n"
+"  v_mul_f32     %[c8], %[scale_0], %[c8]                            \n"
+"  v_mul_f32     %[c9], %[scale_0], %[c9]                            \n"
+"  v_mul_f32     %[c10], %[scale_0], %[c10]                            \n"
+"  v_mul_f32     %[c11], %[scale_0], %[c11]                            \n"
+"  v_mul_f32     %[c12], %[scale_1], %[c12]                            \n"
+"  v_mul_f32     %[c13], %[scale_1], %[c13]                            \n"
+"  v_mul_f32     %[c14], %[scale_1], %[c14]                            \n"
+"  v_mul_f32     %[c15], %[scale_1], %[c15]                            \n"
+_UK_PK_CVT_("%[c0]",  "%[c1]",  "%[c0]")
+_UK_PK_CVT_("%[c2]",  "%[c3]",  "%[c1]")
+_UK_PK_CVT_("%[c4]",  "%[c5]",  "%[c2]")
+_UK_PK_CVT_("%[c6]",  "%[c7]",  "%[c3]")
+_UK_PK_CVT_("%[c8]",  "%[c9]",  "%[c4]")
+_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
+_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
+_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
+"  ;------------------------------  \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c0],%[c1]] offset:0    + %[shfl_base]               \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]               \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]               \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]               \n"
+"  s_waitcnt     lgkmcnt(0)                              \n"
+"  s_barrier                                             \n"
+"  ds_read_b32   %[c0], %[v_sfl_sld] offset:0    + %[shfl_base]                    \n"
+"  ds_read_b32   %[c1], %[v_sfl_sld] offset:32   + %[shfl_base]                    \n"
+"  ds_read_b32   %[c2], %[v_sfl_sld] offset:64   + %[shfl_base]                    \n"
+"  ds_read_b32   %[c3], %[v_sfl_sld] offset:96   + %[shfl_base]                    \n"
+"  ds_read_b32   %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]                    \n"
+"  ds_read_b32   %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]                    \n"
+"  ds_read_b32   %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]                    \n"
+"  ds_read_b32   %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]                    \n"
+"  s_waitcnt     lgkmcnt(0)                              \n"
+"  s_mov_b64     exec, %[s_execflag_0]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o0], %[c0], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_1]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o1], %[c1], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_2]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o2], %[c2], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_3]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o3], %[c3], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_4]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o4], %[c4], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_5]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o5], %[c5], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_6]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o6], %[c6], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_7]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o7], %[c7], s[8:9]  \n"
+"  s_mov_b64     exec, s[38:39]                           \n"
+"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+"  s_cbranch_scc0 L_end%=                                       \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+"  s_add_u32     s12, s86, s12                                  \n"
+"  s_addc_u32    s13, 0, s13                                    \n"
+"  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
+"  s_addc_u32    s9, 0, s9                               \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+"  s_barrier                                             \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
+"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
+"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
+"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
+"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n"
+"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n"
+"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n"
+"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n"
+"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n"
+_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n"
+"  v_mul_f32     %[c16], %[scale_0], %[c16]                            \n"
+"  v_mul_f32     %[c17], %[scale_0], %[c17]                            \n"
+"  v_mul_f32     %[c18], %[scale_0], %[c18]                            \n"
+"  v_mul_f32     %[c19], %[scale_0], %[c19]                            \n"
+"  v_mul_f32     %[c20], %[scale_1], %[c20]                            \n"
+"  v_mul_f32     %[c21], %[scale_1], %[c21]                            \n"
+"  v_mul_f32     %[c22], %[scale_1], %[c22]                            \n"
+"  v_mul_f32     %[c23], %[scale_1], %[c23]                            \n"
+"  v_mul_f32     %[c24], %[scale_0], %[c24]                            \n"
+"  v_mul_f32     %[c25], %[scale_0], %[c25]                            \n"
+"  v_mul_f32     %[c26], %[scale_0], %[c26]                            \n"
+"  v_mul_f32     %[c27], %[scale_0], %[c27]                            \n"
+"  v_mul_f32     %[c28], %[scale_1], %[c28]                            \n"
+"  v_mul_f32     %[c29], %[scale_1], %[c29]                            \n"
+"  v_mul_f32     %[c30], %[scale_1], %[c30]                            \n"
+"  v_mul_f32     %[c31], %[scale_1], %[c31]                            \n"
+
+_UK_PK_CVT_("%[c16]",  "%[c17]",  "%[c16]")
+_UK_PK_CVT_("%[c18]",  "%[c19]",  "%[c17]")
+_UK_PK_CVT_("%[c20]",  "%[c21]",  "%[c18]")
+_UK_PK_CVT_("%[c22]",  "%[c23]",  "%[c19]")
+_UK_PK_CVT_("%[c24]",  "%[c25]",  "%[c20]")
+_UK_PK_CVT_("%[c26]",  "%[c27]",  "%[c21]")
+_UK_PK_CVT_("%[c28]",  "%[c29]",  "%[c22]")
+_UK_PK_CVT_("%[c30]",  "%[c31]",  "%[c23]")
+
+"  ;------------------------------  \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c16],%[c17]] offset:0    + %[shfl_base]         \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]         \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]         \n"
+"  ds_write_b64  %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]         \n"
+"  s_waitcnt     lgkmcnt(0)                              \n"
+"  s_barrier                                             \n"
+"  ds_read_b32   %[c16], %[v_sfl_sld] offset:0    + %[shfl_base]                  \n"
+"  ds_read_b32   %[c17], %[v_sfl_sld] offset:32   + %[shfl_base]                  \n"
+"  ds_read_b32   %[c18], %[v_sfl_sld] offset:64   + %[shfl_base]                  \n"
+"  ds_read_b32   %[c19], %[v_sfl_sld] offset:96   + %[shfl_base]                  \n"
+"  ds_read_b32   %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]                  \n"
+"  ds_read_b32   %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]                  \n"
+"  ds_read_b32   %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]                  \n"
+"  ds_read_b32   %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]                  \n"
+"  s_waitcnt     lgkmcnt(0)                              \n"
+"  s_mov_b64     exec, %[s_execflag_0]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o0], %[c16], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_1]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o1], %[c17], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_2]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o2], %[c18], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_3]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o3], %[c19], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_4]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o4], %[c20], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_5]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o5], %[c21], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_6]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o6], %[c22], s[8:9]  \n"
+"  s_mov_b64     exec, %[s_execflag_7]                    \n"
+_UK_ATOMIC_ADD_ "   %[v_os_o7], %[c23], s[8:9]  \n"
+"  s_mov_b64     exec, s[38:39]                           \n"
+"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+"  s_cbranch_scc0 L_end%=                                       \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+"  s_add_u32     s12, s86, s12                                  \n"
+"  s_addc_u32    s13, 0, s13                                    \n"
+"  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
+"  s_addc_u32    s9, 0, s9                               \n"
+"  s_branch      L_start%=          \n"
+"L_end%=:                                                \n"
+
+#undef _UK_MFMA_
+#undef _UK_PK_CVT_
+#undef _UK_ATOMIC_ADD_
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
new file mode 100644
index 000000000..a34a21d39
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
@@ -0,0 +1,516 @@
+#ifndef CK_TILE_FLATMM_UK_MFMA
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#endif
+
+#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
+#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
+#endif
+
+"s_mov_b32 s16,    %[s_res_a0] \n"
+"s_mov_b32 s17,    %[s_res_a1] \n"
+"s_mov_b32 s18,    %[s_res_a2] \n"
+"s_mov_b32 s19,    %[s_res_a3] \n"
+"s_mov_b32 s20,    %[s_res_b0] \n"
+"s_mov_b32 s21,    %[s_res_b1] \n"
+"s_mov_b32 s22,    %[s_res_b2] \n"
+"s_mov_b32 s23,    %[s_res_b3] \n"
+// "s_nop  4\n"
+"; -- prefetch A0\n"
+"s_add_u32     m0, 0, %[s_m0_init]                        \n"
+"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                \n"
+"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+"s_add_u32 m0, %[smem_sz], %[s_m0_init]                       \n"
+"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move a with cond \n"
+"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+"s_add_u32     s16, s86, s16               ; move a with cond \n"
+"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+"; -- prefetch A1\n"
+"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+"s_add_u32 m0, 0, %[s_m0_init]                                \n"
+"s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+"s_add_u32     s16, s86, s16               ; move a with cond \n"
+"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+"; -- prefetch B0\n"
+"buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
+"buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024  \n"
+"buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048  \n"
+"buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072  \n"
+"buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen  \n"
+"buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024  \n"
+"buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048  \n"
+"buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072  \n"
+"buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen  \n"
+"buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024  \n"
+"buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048  \n"
+"buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072  \n"
+"buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen  \n"
+"buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024    \n"
+"buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048    \n"
+"buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072    \n"
+"buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen                \n"
+"buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024    \n"
+"buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048    \n"
+"buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072    \n"
+"buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen                \n"
+"buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024    \n"
+"buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048    \n"
+"buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072    \n"
+"buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen                \n"
+"buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024  \n"
+"buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048  \n"
+"buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072  \n"
+"buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen              \n"
+"buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024  \n"
+"buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048  \n"
+"buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072  \n"
+"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"s_cselect_b32 s86, %[s_tile_os_b], 0      ; move b with cond \n"
+"s_add_u32     s20, s86, s20               ; move b with cond \n"
+"s_addc_u32    s21, 0, s21                 ; move b with cond \n"
+"s_waitcnt     vmcnt(40)                        \n"
+"s_barrier                                      \n"
+"ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n"    // 1024: N stride, 64 K stride
+"ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
+"ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
+"ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
+"ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
+"ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
+"ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
+"ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
+"L_start%=:                                                         \n"
+"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                             \n"
+"  s_barrier                                                        \n"
+_UK_MFMA_ "  %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n"
+"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n"
+"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n"
+"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n"
+"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n"
+"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n"
+"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n"
+"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n"
+"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n"
+"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n"
+"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n"
+"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n"
+"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n"
+"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n"
+"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n"
+"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n"
+"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[smem_sz], %[s_m0_init]                  \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n"
+"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n"
+"  ds_read_b128  v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]                \n"
+_UK_MFMA_ "  %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n"
+"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n"
+"  ds_read_b128  v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]              \n"
+_UK_MFMA_ "  %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n"
+"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n"
+"  ds_read_b128  v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]              \n"
+_UK_MFMA_ "  %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n"
+"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n"
+"  ds_read_b128  v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]              \n"
+_UK_MFMA_ "  %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n"
+"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n"
+"  ds_read_b128  v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]              \n"
+_UK_MFMA_ "  %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n"
+"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n"
+"  ds_read_b128  v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]              \n"
+_UK_MFMA_ "  %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n"
+"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n"
+"  ds_read_b128  v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]              \n"
+_UK_MFMA_ "  %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n"
+"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n"
+"  ds_read_b128  v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]              \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n"
+"  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n"
+"  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n"
+"  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n"
+"  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n"
+"  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n"
+"  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n"
+"  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n"
+"  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n"
+"  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n"
+"  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n"
+"  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n"
+"  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n"
+"  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n"
+"  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n"
+"  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n"
+"  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n"
+_UK_MFMA_ "  %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n"
+"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+"  s_cbranch_scc0 L_end%=                                       \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+"  s_add_u32     s16, s86, s16                                  \n"
+"  s_addc_u32    s17, 0, s17                                    \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+"  s_add_u32     s20, s86, s20                                  \n"
+"  s_addc_u32    s21, 0, s21                                    \n"
+"  ;------------------------------------------                  \n"
+"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                  \n"
+"  s_barrier                                             \n"
+_UK_MFMA_ "  %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n"
+"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n"
+"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n"
+"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n"
+_UK_MFMA_ "  %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n"
+"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n"
+"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n"
+"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n"
+"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n"
+_UK_MFMA_ "  %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n"
+"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n"
+"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n"
+"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n"
+"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n"
+_UK_MFMA_ "  %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n"
+"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n"
+"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n"
+"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+_UK_MFMA_ "  %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n"
+"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n"
+_UK_MFMA_ "  %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n"
+"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+"  s_add_u32     m0, 0, %[s_m0_init]                  \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n"
+"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n"
+"  ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]  \n"
+_UK_MFMA_ "  %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n"
+"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n"
+_UK_MFMA_ "  %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n"
+"  ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]  \n"
+_UK_MFMA_ "  %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n"
+"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n"
+"  ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]                 \n"
+_UK_MFMA_ "  %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n"
+"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n"
+_UK_MFMA_ "  %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n"
+"  ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]                \n"
+_UK_MFMA_ "  %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n"
+"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n"
+"  ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]               \n"
+_UK_MFMA_ "  %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n"
+"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n"
+_UK_MFMA_ "  %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n"
+"  ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]            \n"
+_UK_MFMA_ "  %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n"
+"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n"
+"  ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]                \n"
+_UK_MFMA_ "  %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n"
+"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n"
+_UK_MFMA_ "  %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n"
+"  ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]           \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n"
+"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n"
+"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n"
+"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n"
+"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n"
+"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n"
+"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n"
+"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n"
+"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n"
+_UK_MFMA_ "  %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n"
+"  s_waitcnt     vmcnt(32)                               \n"
+_UK_MFMA_ "  %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n"
+"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n"
+"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n"
+"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13] \n"
+"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n"
+"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen \n"
+_UK_MFMA_ "  %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n"
+"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
+_UK_MFMA_ "  %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n"
+"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
+_UK_MFMA_ "  %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n"
+"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n"
+_UK_MFMA_ "  %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n"
+_UK_MFMA_ "  %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n"
+"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+"  s_cbranch_scc0 L_end%=                                       \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+"  s_add_u32     s16, s86, s16                                  \n"
+"  s_addc_u32    s17, 0, s17                                    \n"
+"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+"  s_add_u32     s20, s86, s20                                  \n"
+"  s_addc_u32    s21, 0, s21                                    \n"
+"  s_branch     L_start%=                                       \n"
+"L_end%=:                                                       \n"
+"  s_nop 2                                                      \n"
+
+#undef _UK_MFMA_
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index 10bb01168..173887513 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -331,7 +331,8 @@ struct BlockFmhaPipelineQRKSVSAsync
                              Policy::template MakeVDramTileDistribution<Problem>());
 
         // prefetch K tile
-        async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np);
+        async_load_tile_raw(
+            k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, number<-1>{}, k_oob_ck, k_pre_np);
         move_tile_window(k_dram_window, {0, kK0});
         __builtin_amdgcn_sched_barrier(0);
 
@@ -355,6 +356,7 @@ struct BlockFmhaPipelineQRKSVSAsync
                 static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
                     async_load_tile_raw(k_lds_store(number<LdsSeq.at(number<i_k0 + 1>{})>{}),
                                         k_dram_window,
+                                        number<-1>{},
                                         k_oob_ck,
                                         k_pre_np);
                     if constexpr(i_k0 < k0_loops - 1)
@@ -386,7 +388,7 @@ struct BlockFmhaPipelineQRKSVSAsync
             __builtin_amdgcn_s_barrier();
 
             const auto bias_tile = load_tile(bias_dram_window); // load bias tile
-            auto v_buf           = load_tile(v_dram_window, bool_constant<false>{});
+            auto v_buf           = load_tile(v_dram_window, number<-1>{}, bool_constant<false>{});
             __builtin_amdgcn_sched_barrier(0);
             { // tail
                 gemm_0(s_acc,
@@ -514,7 +516,8 @@ struct BlockFmhaPipelineQRKSVSAsync
                 move_tile_window(
                     v_dram_window,
                     {0, kK1}); // will have scratch if move this right after load_tile(v_dram)...
-                v_buf = load_tile(v_dram_window, bool_constant<false>{}); // load next v_buf
+                v_buf = load_tile(
+                    v_dram_window, number<-1>{}, bool_constant<false>{}); // load next v_buf
             }
             __builtin_amdgcn_sched_barrier(0);
 
@@ -618,7 +621,8 @@ struct BlockFmhaPipelineQRKSVSAsync
                 static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
                     if constexpr(i_k1 != 0 && i_k1 < k1_loops - 1)
                     {
-                        v_buf = load_tile(v_dram_window, bool_constant<false>{}); // load next v_buf
+                        v_buf = load_tile(
+                            v_dram_window, number<-1>{}, bool_constant<false>{}); // load next v_buf
                     }
                     block_sync_lds();
                     gemm_1(o_acc,
@@ -665,8 +669,11 @@ struct BlockFmhaPipelineQRKSVSAsync
                 if constexpr(k1_loops >= 2 &&
                              LdsSeq.at(number<0>{}) == LdsSeq.at(number<k0_loops + k1_loops - 2>{}))
                     __builtin_amdgcn_s_barrier();
-                async_load_tile_raw(
-                    k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, k_oob_ck, k_pre_np);
+                async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})),
+                                    k_dram_window,
+                                    number<-1>{},
+                                    k_oob_ck,
+                                    k_pre_np);
                 move_tile_window(k_dram_window, {0, kK0});
             }
             // tail
diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp
index b74607f06..d23af0af8 100644
--- a/include/ck_tile/ops/fused_moe.hpp
+++ b/include/ck_tile/ops/fused_moe.hpp
@@ -3,7 +3,15 @@
 
 #pragma once
 
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp"
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp"
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp"
 #include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
 #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
 #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
 #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp"
diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
new file mode 100644
index 000000000..2d25d44f3
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include <string>
+#include <type_traits>
+
+// clang-format off
+// [indexing implementation-1]
+// using M_a as constexpr block_size to partition all tokens into different slices
+// each slice map to one expert, and one expert can have multiple slices
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+//
+// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
+// * this could be larger than actual, since actual tokens are on GPU
+//
+// sorted_token_ids_ptr   : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5]
+//                          |-  exp-0  -|-  exp-1  -|-  exp-2  -|-      exp-3          -|-  exp-4 -|-  exp-5  -|
+// sorted_weight_ptr      : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *, c, f, i, o]
+//
+// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
+//
+// * Note on token_id_per_expert/sorted_token_ids_ptr data:
+// currently we do not have topk information from the data of token_id_per_expert/sorted_token_ids_ptr.
+// In some cases(like smooth-quant), we need topk information to indexing into tokens quant from 
+// different expert smooth quant. So we modify the number stored inside token_id_per_expert/sorted_token_ids_ptr
+//
+//       32bit    0........23 24.....31 bit
+//      (data) -> (token_id | topk_id)
+// low 24 bit is for token id, top 8 bit is for topk id
+//
+// the input after smooth-quant is [token, topk, hidden_dim], originally it is [token, hidden_dim]
+// the input scale for token is [topk, token, 1], the smooth-quant scale for first gemm is [expert, interm_dim]
+//
+// sorted_expert_ids_ptr  : [0, 1, 2, 3, 3, 4, 5]
+// * length is (max_num_tokens_padded + block_size - 1) / block_size
+//
+// num_tokens_post_padded_ptr : [28]
+// num_sorted_tiles_ptr : [7]
+//
+// * different from vLLM
+//   1) token_id stored in sorted_token_ids_ptr is actual token_id, not token_id*top_K expanded id
+//   2）need sorted_weight_ptr
+//   3) use num_sorted_tiles_ptr, already divided by M_a
+//
+// * below used for indexing
+//  1) sorted_token_ids_ptr [max_num_tokens_padded]
+//  2) sorted_weight_ptr
+//  3) sorted_expert_ids_ptr
+//  4）num_tokens_post_padded_ptr/num_sorted_tiles_ptr (select one)
+//
+//   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
+//
+// [indexing implementation-2]
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float number)
+//
+// we generate original rol/col id as
+//              topk_rc_ids : [[0, 5, A], [1, 6, B], [2, 7, C], [3, 8, D], [4, 9, E]]
+// let x be one element of above, we can get:
+//          tpok_row_id(token_id) = x % num_tokens(5)
+//         tpok_col_id(expert_Id) = x / num_tokens
+// topk_row_id/col_id can be used to access original topk_ids/topk_weight
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 5, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+//
+// we can get permuted_rc_ids:
+//                          [[0], [2, 3, 4], [1, 8], [5, 6, 7, D, 9], [], [A, B, C, E]]
+//
+//
+// clang-format on
+//
+namespace ck_tile {
+
+// m: num_tokens (or token*input-batch)
+// k: intermediate_size
+// n: intermediate_size used between 2 FC (TP slice this)
+// e: num expert
+// if doing pre-shuffle
+// nr : n / Block_Nr
+// kr : k / Block_Kr
+// w  : fattened 1d wave buffer
+struct FusedMoeGemmHostArgs
+{
+    const void* a_ptr;              // [m, k], input token
+    const void* a_scale_ptr;        // [m, 1], token scale
+    const void* g_ptr;              // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w])
+    const void* d_ptr;              // [e, n, k], pre-shuffle([e, nr, kr, w])
+    const void* g_scale_ptr;        // [e, 1, n], gate(up) scale
+    const void* d_scale_ptr;        // [e, 1, k], down scale
+    const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input
+    void* o_ptr;                    // [m, k], output token
+
+    const void* sorted_token_ids_ptr;  // [max_num_tokens_padded]
+    const void* sorted_weight_ptr;     // [max_num_tokens_padded]
+    const void* sorted_expert_ids_ptr; // [(max_num_tokens_padded + block_size - 1) / block_size]
+    const void* num_sorted_tiles_ptr;  // [1]
+
+    index_t hidden_size;       // k
+    index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2
+    index_t num_tokens;        // input number of tokens for current iteration
+    index_t num_experts;       // number of groups
+    index_t topk;              // need this?
+
+    index_t stride_token; // for input/output, stride for each row, should >= hidden_size
+};
+
+// This is scatter/gather b2b group-gemm
+template <typename Partitioner_, typename Pipeline_, typename Epilogue_>
+struct FusedMoeGemmKernel
+{
+    using Partitioner = remove_cvref_t<Partitioner_>;
+    using Pipeline    = remove_cvref_t<Pipeline_>;
+    using Epilogue    = remove_cvref_t<Epilogue_>; // TODO: not used
+    // static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu;
+    // static_assert(kBlockPerCu > 0);
+
+    using BlockShape = typename Pipeline::BlockShape; // this is FusedMoeGemmShape
+    static constexpr index_t BlockSize_ = BlockShape::BlockSize;
+
+    using ADataType            = typename Pipeline::Problem::ADataType;
+    using GDataType            = typename Pipeline::Problem::GDataType;
+    using DDataType            = typename Pipeline::Problem::DDataType;
+    using AccDataType          = typename Pipeline::Problem::AccDataType;
+    using ODataType            = typename Pipeline::Problem::ODataType;
+    using AScaleDataType       = typename Pipeline::Problem::AScaleDataType;
+    using GScaleDataType       = typename Pipeline::Problem::GScaleDataType;
+    using DScaleDataType       = typename Pipeline::Problem::DScaleDataType;
+    using YSmoothScaleDataType = typename Pipeline::Problem::YSmoothScaleDataType;
+    using TopkWeightDataType   = typename Pipeline::Problem::TopkWeightDataType;
+    using IndexDataType        = typename Pipeline::Problem::IndexDataType;
+    using YDataType            = typename Pipeline::Problem::YDataType;
+
+    using Traits                = typename Pipeline::Problem::Traits;
+    static constexpr bool UseUK = true;
+
+    static constexpr bool IsGateOnly          = Traits::IsGateOnly;
+    static constexpr bool UseSmoothQuant      = Traits::UseSmoothQuant;
+    static constexpr bool PadHiddenSize       = Traits::PadHiddenSize;
+    static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize;
+
+    // clang-format off
+    template <typename T> struct t2s;
+    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
+    template <> struct t2s<fp16_t> { static constexpr const char * name = "fp16"; };
+    template <> struct t2s<bf16_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<fp8_t> { static constexpr const char * name = "fp8"; };
+    template <> struct t2s<bf8_t> { static constexpr const char * name = "bf8"; };
+    template <> struct t2s<int8_t> { static constexpr const char * name = "int8"; };
+    // clang-format on
+
+    CK_TILE_HOST static std::string GetName()
+    {
+#define _SS_ std::string
+#define _TS_ std::to_string
+        // clang-format off
+        using S_ = BlockShape;
+
+        auto prec_str = [&] () {
+            std::string base_str = _SS_(t2s<ADataType>::name);
+            if (!std::is_same_v<ADataType, GDataType>) {
+                base_str += _SS_("_") + _SS_(t2s<GDataType>::name);
+            }
+            return base_str;
+        }();
+
+        return _SS_("fused_moe_") + _SS_(prec_str) + "_" +
+             _TS_(S_::Block_M0) + "x" + _TS_(S_::Block_N0) + "x" + _TS_(S_::Block_K0) + "x" + _TS_(S_::Block_N1) + "_" +
+             _TS_(S_::WarpPerBlock_M0) + "x" + _TS_(S_::WarpPerBlock_N0) + "x" + _TS_(S_::WarpPerBlock_K0) + "_" +
+             _TS_(S_::Warp_M0) + "x" + _TS_(S_::Warp_N0) + "x" + _TS_(S_::Warp_K0) + "_" + _SS_(Pipeline::name);
+#undef _SS_
+#undef _TS_
+        // clang-format on
+    }
+
+    struct FusedMoeGemmKargs
+    {
+        const void* a_ptr;              // [m, k], input token
+        const void* a_scale_ptr;        // [m, 1], token scale
+        const void* g_ptr;              // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w])
+        const void* d_ptr;              // [e, n, k], pre-shuffle([e, nr, kr, w])
+        const void* g_scale_ptr;        // [e, 1, n], gate(up) scale
+        const void* d_scale_ptr;        // [e, 1, k], down scale
+        const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input
+        void* o_ptr;                    // [m, k], output token
+
+        const void* sorted_token_ids_ptr;
+        const void* sorted_weight_ptr;
+        const void* sorted_expert_ids_ptr;
+        const void* num_sorted_tiles_ptr;
+
+        index_t hidden_size;       // k
+        index_t intermediate_size; // n / TP, for Gate. if Gate+Up, Down need divide by 2
+        index_t num_tokens;        // input number of tokens for current iteration
+        index_t num_experts;       // number of groups
+        index_t topk;              // need this?
+
+        index_t stride_token; // for input/output, stride for each row, should >= hidden_size
+    };
+
+    // TODO: switch karg based on
+    using Kargs = FusedMoeGemmKargs;
+    using Hargs = FusedMoeGemmHostArgs;
+
+    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
+    {
+        // TODO: hargs/kargs not guranteed to be the same
+        return bit_cast<Kargs>(hargs);
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
+    {
+        constexpr index_t block_m = BlockShape::Block_M0;
+        int max_num_tokens_padded =
+            hargs.topk * hargs.num_tokens + hargs.num_experts * block_m - hargs.topk;
+        // printf("xxx max_num_tokens_padded:%d\n", max_num_tokens_padded);
+        return Partitioner::GridSize(max_num_tokens_padded, hargs.intermediate_size);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(BlockSize_); }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        if constexpr(UseUK)
+        {
+            __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()];
+            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+                *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
+
+            num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0;
+
+            const auto [sorted_tile_id, intermediate_tile_id] =
+                Partitioner{}(num_sorted_tiles, kargs.intermediate_size);
+            // if(threadIdx.x == 0)
+            // printf("bid:%d,%d, num_sorted_tiles:%d, sorted_tile_id:%d(%d),
+            // intermediate_tile_id:%d\n", static_cast<int>(blockIdx.x),
+            //     static_cast<int>(blockIdx.y), num_sorted_tiles, sorted_tile_id, sorted_tile_id >=
+            //     num_sorted_tiles? 1 : 0, intermediate_tile_id);
+            if(sorted_tile_id >= num_sorted_tiles)
+                return;
+
+            Pipeline{}(kargs, smem, sorted_tile_id, intermediate_tile_id);
+        }
+        else
+        {
+            // allocate LDS
+            // __shared__ char smem_ptr[GetSmemSize()];
+            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+                *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
+            constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;
+
+            index_t nr_0 = kargs.intermediate_size / BlockShape::Block_Nr0;
+            index_t kr_0 = kargs.hidden_size / BlockShape::Block_Kr0;
+            index_t nr_1 = kargs.hidden_size / BlockShape::Block_Nr1; // should be same as kr_0
+            index_t kr_1 =
+                kargs.intermediate_size / BlockShape::Block_Kr1; // should be same as nr_0
+
+            index_t expert_stride_0 = kargs.intermediate_size * hidden_radio_0 * kargs.hidden_size;
+            index_t expert_stride_1 = kargs.intermediate_size * kargs.hidden_size;
+
+            __shared__ CK_TILE_LDS_ADDR ADataType smem[GetSmemSize()];
+
+            // note this is in unit of tile, need multiple tile size to get the index
+            const auto [sorted_tile_id, intermediate_tile_id] =
+                Partitioner{}(num_sorted_tiles, kargs.intermediate_size);
+            if(sorted_tile_id >= num_sorted_tiles)
+                return;
+
+            const IndexDataType expert_id =
+                __builtin_amdgcn_readfirstlane(reinterpret_cast<const IndexDataType*>(
+                    kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
+
+            // index along intermediate_size
+            // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id *
+            // BlockShape::Block_N0);
+            index_t interm_idx_nr =
+                __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0);
+
+            const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col]
+            const auto sorted_token_id =
+                a_coord[number<0>{}] + sorted_tile_id * BlockShape::Block_M0;
+
+            index_t token_id =
+                reinterpret_cast<const index_t*>(kargs.sorted_token_ids_ptr)[sorted_token_id];
+            auto topk_weight = reinterpret_cast<const TopkWeightDataType*>(
+                kargs.sorted_weight_ptr)[sorted_token_id];
+
+            const auto a_window = [&]() {
+                // A is already pre-padded in previous kernel
+                const ADataType* a_ptr = reinterpret_cast<const ADataType*>(kargs.a_ptr);
+                const auto a_view_     = make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(kargs.num_tokens, kargs.hidden_size),
+                    make_tuple(kargs.stride_token, 1),
+                    number<Pipeline::kAlignmentA>{},
+                    number<1>{});
+
+                // gather is here use indexing transform
+                const auto a_gather_view_ = transform_tensor_view(
+                    a_view_,
+                    make_tuple(make_indexing_transform(kargs.num_tokens, token_id),
+                               make_pass_through_transform(kargs.hidden_size)),
+                    make_tuple(sequence<0>{}, sequence<1>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}));
+
+                const auto a_window_ = make_tile_window(
+                    a_gather_view_,
+                    make_tuple(number<BlockShape::Block_M0>{}, number<BlockShape::Block_K0>{}),
+                    {0, 0});
+                return a_window_;
+            }();
+
+            // TODO: gtile using NSub to have less register pressure
+            const auto g_window = [&]() {
+                const GDataType* g_ptr = reinterpret_cast<const GDataType*>(kargs.g_ptr) +
+                                         static_cast<long_index_t>(expert_id) * expert_stride_0 +
+                                         interm_idx_nr * kr_0 * BlockShape::Block_W0;
+                const auto g_view_ = make_naive_tensor_view<address_space_enum::global>(
+                    g_ptr,
+                    make_tuple(nr_0, kr_0, number<BlockShape::Block_W0>{}),
+                    make_tuple(kr_0 * BlockShape::Block_W0, number<BlockShape::Block_W0>{}, 1),
+                    number<Pipeline::kAlignmentG>{},
+                    number<1>{});
+                const auto g_view_1_ =
+                    pad_tensor_view(g_view_,
+                                    make_tuple(number<BlockShape::Block_Nr0>{},
+                                               number<BlockShape::Block_Kr0>{},
+                                               number<BlockShape::Block_W0>{}),
+                                    sequence<PadIntermediateSize, PadHiddenSize, 0>{});
+
+                const auto g_window_ = make_tile_window(g_view_1_,
+                                                        make_tuple(number<BlockShape::Block_Nr0>{},
+                                                                   number<BlockShape::Block_Kr0>{},
+                                                                   number<BlockShape::Block_W0>{}),
+                                                        {0, 0, 0});
+                return g_window_;
+            }();
+
+            const auto d_window = [&]() {
+                const DDataType* d_ptr = reinterpret_cast<const DDataType*>(kargs.d_ptr) +
+                                         static_cast<long_index_t>(expert_id) * expert_stride_1 +
+                                         interm_idx_nr * BlockShape::Block_W1;
+                // note interm_idx_nr is along the gemm-k dim of 2nd gemm
+
+                const auto d_view_ = make_naive_tensor_view<address_space_enum::global>(
+                    d_ptr,
+                    make_tuple(nr_1, kr_1, BlockShape::Block_W1),
+                    make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1),
+                    number<Pipeline::kAlignmentD>{},
+                    number<1>{});
+                const auto d_view_1_ =
+                    pad_tensor_view(d_view_,
+                                    make_tuple(number<BlockShape::Block_Nr1>{},
+                                               number<BlockShape::Block_Kr1>{},
+                                               number<BlockShape::Block_W1>{}),
+                                    sequence<PadHiddenSize, PadIntermediateSize, 0>{});
+
+                const auto d_window_ = make_tile_window(d_view_1_,
+                                                        make_tuple(number<BlockShape::Block_Nr1>{},
+                                                                   number<BlockShape::Block_Kr1>{},
+                                                                   number<BlockShape::Block_W1>{}),
+                                                        {0, 0, 0});
+                return d_window_;
+            }();
+
+            auto o_window = [&]() {
+                ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr);
+                auto o_view_     = make_naive_tensor_view<address_space_enum::global,
+                                                      memory_operation_enum::atomic_add>(
+                    o_ptr,
+                    make_tuple(kargs.num_tokens, kargs.hidden_size),
+                    make_tuple(kargs.stride_token, 1),
+                    number<Pipeline::kAlignmentO>{},
+                    number<1>{});
+
+                // gather is here
+                auto o_scatter_view_ = transform_tensor_view(
+                    o_view_,
+                    make_tuple(make_indexing_transform(kargs.num_tokens, token_id),
+                               make_pass_through_transform(kargs.hidden_size)),
+                    make_tuple(sequence<0>{}, sequence<1>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}));
+
+                auto o_window_ = make_tile_window(
+                    o_scatter_view_,
+                    make_tuple(number<BlockShape::Block_M1>{}, number<BlockShape::Block_N1>{}),
+                    {0, 0});
+                return o_window_;
+            }();
+
+            // do compute yeah
+            Pipeline{}(a_window,
+                       g_window,
+                       d_window,
+                       o_window,
+                       topk_weight,
+                       smem,
+                       kargs.hidden_size,
+                       kargs.intermediate_size,
+                       kargs.stride_token);
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
new file mode 100644
index 000000000..4f3f8bb7d
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+/*
+tensors:
+1. act  (A): input feature map
+2. gate (G): B matrix for first gemm, output will do activation(Silu)
+3. up   (U): B matrix for first gemm
+4. down (D): B matrix for second gemm
+                                                                  N1
+                                                                 /   \
+                                                                 +----------+ |
+                                                                 |   Down   | |
+                                                                 x----------x |
+                        hidden               hidden           K1 |          | |
+                          N0                   N0                x----------x |
+             |   +------x-----x------+------x-----x------+       |          | |
+    dim      |   | Gate |     |      | Up   |     |      |       |          | |
+  contiguous |   |      |     |      |      |     |      |       |          | |
+             |   |      |     |      |      |     |      |       |          | |
+             v   +------x-----x------+------x-----x------+       +----------+ V
+      K0                |     |             |     |                    | contiguous
+     /  \               v     v             v     v                    |
+    +---------+  +------x-----x------+------x-----x------+             |
+M0  |    A    |  |      |     |      |      |     |      |             |
+    +---------+  +------x-----x------+------x-----x------+             |
+    ---------->           |                    |                       |
+    contiguous            |                    V                       V
+                          |                 x-----x              +----------+
+                          +------------> M1 |  Y  |  --------->  |  Out(O)  |
+                             ACT            x-----x              +----------+
+                                              K1 = N0                 dim
+
+* Note: Act could be Gelu/Silu/...
+* Note: some model does not have Up
+*/
+template <typename BlockTile_0_,
+          typename WarpPerBlock_0_,
+          typename WarpTile_0_,
+          typename BlockTile_1_,
+          typename WarpPerBlock_1_,
+          typename WarpTile_1_>
+struct FusedMoeGemmShape
+{
+    using BlockTile_0    = remove_cvref_t<BlockTile_0_>;
+    using WarpPerBlock_0 = remove_cvref_t<WarpPerBlock_0_>;
+    using WarpTile_0     = remove_cvref_t<WarpTile_0_>;
+    using BlockTile_1    = remove_cvref_t<BlockTile_1_>;
+    using WarpPerBlock_1 = remove_cvref_t<WarpPerBlock_1_>;
+    using WarpTile_1     = remove_cvref_t<WarpTile_1_>;
+
+    static constexpr index_t NumWarps =
+        reduce_on_sequence(WarpPerBlock_0{}, multiplies{}, number<1>{});
+
+    // TODO: we don't support half warps aound to 1 warp here
+    static_assert(NumWarps == reduce_on_sequence(WarpPerBlock_1{}, multiplies{}, number<1>{}));
+
+    static constexpr index_t Block_M0        = BlockTile_0::at(number<0>{});
+    static constexpr index_t Block_N0        = BlockTile_0::at(number<1>{});
+    static constexpr index_t Block_K0        = BlockTile_0::at(number<2>{});
+    static constexpr index_t WarpPerBlock_M0 = WarpPerBlock_0::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N0 = WarpPerBlock_0::at(number<1>{});
+    static constexpr index_t WarpPerBlock_K0 = WarpPerBlock_0::at(number<2>{});
+    static constexpr index_t Warp_M0         = WarpTile_0::at(number<0>{});
+    static constexpr index_t Warp_N0         = WarpTile_0::at(number<1>{});
+    static constexpr index_t Warp_K0         = WarpTile_0::at(number<2>{});
+
+    static constexpr index_t ThreadPerBlock_M0 = Warp_M0 * WarpPerBlock_M0;
+    static constexpr index_t ThreadPerBlock_N0 = Warp_N0 * WarpPerBlock_N0;
+    static constexpr index_t ThreadPerBlock_K0 = Warp_K0 * WarpPerBlock_K0;
+    static_assert(Block_M0 % ThreadPerBlock_M0 == 0);
+    static_assert(Block_N0 % ThreadPerBlock_N0 == 0);
+    static_assert(Block_K0 % ThreadPerBlock_K0 == 0);
+    static constexpr index_t Repeat_M0 = Block_M0 / ThreadPerBlock_M0;
+    static constexpr index_t Repeat_N0 = Block_N0 / ThreadPerBlock_N0;
+    static constexpr index_t Repeat_K0 = Block_K0 / ThreadPerBlock_K0;
+
+    static constexpr index_t Block_M1        = BlockTile_1::at(number<0>{});
+    static constexpr index_t Block_N1        = BlockTile_1::at(number<1>{});
+    static constexpr index_t Block_K1        = BlockTile_1::at(number<2>{});
+    static constexpr index_t WarpPerBlock_M1 = WarpPerBlock_1::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N1 = WarpPerBlock_1::at(number<1>{});
+    static constexpr index_t WarpPerBlock_K1 = WarpPerBlock_1::at(number<2>{});
+    static constexpr index_t Warp_M1         = WarpTile_1::at(number<0>{});
+    static constexpr index_t Warp_N1         = WarpTile_1::at(number<1>{});
+    static constexpr index_t Warp_K1         = WarpTile_1::at(number<2>{});
+
+    static constexpr index_t ThreadPerBlock_M1 = Warp_M1 * WarpPerBlock_M1;
+    static constexpr index_t ThreadPerBlock_N1 = Warp_N1 * WarpPerBlock_N1;
+    static constexpr index_t ThreadPerBlock_K1 = Warp_K1 * WarpPerBlock_K1;
+    static_assert(Block_M1 % ThreadPerBlock_M1 == 0);
+    static_assert(Block_N1 % ThreadPerBlock_N1 == 0);
+    static_assert(Block_K1 % ThreadPerBlock_K1 == 0);
+    static constexpr index_t Repeat_M1 = Block_M1 / ThreadPerBlock_M1;
+    static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1;
+    static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1;
+
+    static constexpr index_t BlockSize = warpSize * NumWarps;
+
+    // some assert
+    static_assert(Block_M0 == Block_M1);
+    static_assert(Block_N0 == Block_K1 || (Block_N0 / 2) == Block_K1); // Gate Only or Gate+Up
+
+    // pre-shuffle tile size compute (assume only for B matrix)
+    // we flatten the each wave tile to a 1d linear tensor(at model loading time)
+    // e.g. originally we have Block_N*Block_K tile size, after pre-shuffle
+    // we can have Block_Nr*Block_Kr*Block_W, where Block_W is Warp_N*Warp_K,
+    // and Block_Nr=Block_N/Warp_N, Block_Kr=Block_K/Warp_K
+    static constexpr index_t Block_W0  = Warp_N0 * Warp_K0;
+    static constexpr index_t Block_Nr0 = Block_N0 / Warp_N0;
+    static constexpr index_t Block_Kr0 = Block_K0 / Warp_K0;
+    static constexpr index_t Block_W1  = Warp_N1 * Warp_K1;
+    static constexpr index_t Block_Nr1 = Block_N1 / Warp_N1;
+    static constexpr index_t Block_Kr1 = Block_K1 / Warp_K1;
+
+    static_assert(Block_W0 == Block_W1);
+    // static_assert(Block_Nr0 == Block_Kr1);
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp
new file mode 100644
index 000000000..381edb650
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner.hpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck_tile {
+
+template <typename BlockShape_>
+struct FusedMoeGemmTilePartitioner_Linear
+{
+    //  FusedMoeGemmShape
+    using BlockShape = ck_tile::remove_cvref_t<BlockShape_>;
+
+    static constexpr const char* name = "lin";
+
+    CK_TILE_DEVICE auto operator()(ck_tile::index_t /*num_sorted_tiles*/,
+                                   ck_tile::index_t /*intermediate_size*/)
+    {
+        index_t i_n = blockIdx.x;
+        index_t i_m = blockIdx.y;
+
+        return ck_tile::make_tuple(i_m, i_n);
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t max_tokens, index_t intermediate_size)
+    {
+        // TODO: this may need tuning
+        index_t ms = ck_tile::integer_divide_ceil(max_tokens, BlockShape::Block_M0);
+        index_t ns = ck_tile::integer_divide_ceil(intermediate_size, BlockShape::Block_N0);
+        return dim3(ns, ms, 1);
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp
new file mode 100644
index 000000000..e9577e230
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp
@@ -0,0 +1,651 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
+
+namespace ck_tile {
+
+/*
+This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight)
+we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave)
+
+    <----- gemm-N ------>
+    +----+----+----+----+
+    | w0 | w1 | w2 | w3 | gemm-m
+    +----+----+----+----+
+*/
+template <typename Problem_, typename Policy_ = FusedMoeGemmPipelineFlatmmPolicy>
+struct FusedMoeGemmPipeline_FlatmmEx
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape
+
+    using ADataType            = typename Problem::ADataType;
+    using GDataType            = typename Problem::GDataType;
+    using DDataType            = typename Problem::DDataType;
+    using AccDataType          = typename Problem::AccDataType;
+    using ODataType            = typename Problem::ODataType;
+    using AScaleDataType       = typename Problem::AScaleDataType;
+    using GScaleDataType       = typename Problem::GScaleDataType;
+    using DScaleDataType       = typename Problem::DScaleDataType;
+    using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType;
+    using TopkWeightDataType   = typename Problem::TopkWeightDataType;
+    using IndexDataType        = typename Problem::IndexDataType;
+    using YDataType            = typename Problem::YDataType;
+
+    using Traits = typename Problem::Traits;
+
+    static constexpr bool IsGateOnly          = Traits::IsGateOnly;
+    static constexpr bool UseSmoothQuant      = Traits::UseSmoothQuant;
+    static constexpr bool PadHiddenSize       = Traits::PadHiddenSize;
+    static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize;
+
+    static constexpr index_t kAlignmentA = Policy::template GetAlignment_A<Problem>();
+    static constexpr index_t kAlignmentG = Policy::template GetAlignment_G<Problem>();
+    static constexpr index_t kAlignmentD = Policy::template GetAlignment_D<Problem>();
+    static constexpr index_t kAlignmentO = Policy::template GetAlignment_O<Problem>();
+
+    static constexpr index_t SLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::SLD_A);
+    static constexpr index_t GLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_A);
+    static constexpr index_t GLD_B = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_B);
+    static constexpr index_t GST_O = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GST_O);
+
+    static constexpr index_t kBlockPerCu = []() {
+        if constexpr(Problem::kBlockPerCu != -1)
+            return Problem::kBlockPerCu;
+        else
+        {
+            // minimize occupancy
+            return 2;
+        }
+    }();
+
+    static constexpr const char* name = "fused_moe_flatmm";
+
+    // TODO: there are multiple buffers
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A()
+    {
+        return Policy::template GetSmemSize_A<Problem>();
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    // this is the thread-offset along row/col
+    CK_TILE_HOST_DEVICE static auto GetACoord()
+    {
+        constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A<Problem>();
+        const auto a_coord    = a_dist.calculate_index();
+        return a_coord;
+    }
+
+    // this is the thread-offset along row/col
+    CK_TILE_HOST_DEVICE static auto GetOCoord()
+    {
+        constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution<Problem>();
+        const auto o_coord    = o_dist.calculate_index();
+        return o_coord;
+    }
+
+    template <typename AWindow, typename GWindow, typename DWindow, typename OWindow>
+    CK_TILE_DEVICE auto operator()(const AWindow& a_window_,
+                                   const GWindow& g_window_,
+                                   const DWindow& d_window_,
+                                   OWindow& o_window_,
+                                   TopkWeightDataType /*topk_weight*/,
+                                   CK_TILE_LDS_ADDR void* smem,
+                                   index_t hidden_size,
+                                   index_t intermediate_size)
+    {
+        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wc++20-extensions\"");
+        constexpr auto NEG1  = number<-1>{};
+        constexpr auto I0    = number<0>{};
+        constexpr auto I1    = number<1>{};
+        constexpr auto TRUE  = bool_constant<true>{};
+        constexpr auto FALSE = bool_constant<false>{};
+
+        CK_TILE_LDS_ADDR ADataType* smem_0 = reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem);
+        CK_TILE_LDS_ADDR ADataType* smem_1 = reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(
+            reinterpret_cast<CK_TILE_LDS_ADDR char*>(smem) +
+            Policy::template GetSmemSize_A<Problem>());
+
+        auto g_view = g_window_.get_bottom_tensor_view();
+
+        auto u_view = [&]() {
+            if constexpr(IsGateOnly)
+            {
+                return g_view;
+            }
+            else
+            {
+                index_t nr_0 = intermediate_size / BlockShape::Block_Nr0;
+                index_t kr_0 = hidden_size / BlockShape::Block_Kr0;
+
+                const GDataType* g_ptr =
+                    g_window_.get_bottom_tensor_view().get_buffer_view().p_data_;
+                const GDataType* u_ptr = g_ptr + (nr_0 / 2) * kr_0 * number<BlockShape::Block_W0>{};
+
+                const auto u_view_ = make_naive_tensor_view<address_space_enum::global>(
+                    u_ptr,
+                    make_tuple(nr_0, kr_0, number<BlockShape::Block_W0>{}),
+                    make_tuple(kr_0 * BlockShape::Block_W0, number<BlockShape::Block_W0>{}, 1),
+                    number<kAlignmentG>{},
+                    number<1>{});
+                const auto u_view_1_ =
+                    pad_tensor_view(u_view_,
+                                    make_tuple(number<BlockShape::Block_Nr0>{},
+                                               number<BlockShape::Block_Kr0>{},
+                                               number<BlockShape::Block_W0>{}),
+                                    sequence<PadIntermediateSize, PadHiddenSize, 0>{});
+                return u_view_1_;
+            }
+        }();
+
+        auto a_win = make_tile_window_linear(
+            a_window_, Policy::template MakeGlobalTileDistribution_A<Problem>());
+        auto g_win =
+            make_tile_window_linear(g_window_,
+                                    Policy::template MakeGlobalTileDistribution_G<Problem>(),
+                                    sequence<0, 1, 1>{});
+        auto d_win =
+            make_tile_window_linear(d_window_,
+                                    Policy::template MakeGlobalTileDistribution_D<Problem>(),
+                                    sequence<0, 1, 1>{});
+        auto o_win = make_tile_window_linear(
+            o_window_, Policy::template MakeGlobalTileDistribution_O<Problem>());
+
+        using g_thread_type = decltype(load_tile(g_win));
+        using d_thread_type = decltype(load_tile(d_win));
+
+        using WarpGemm0  = decltype(Policy::template GetWarpGemm0<Problem>());
+        using WarpGemm1  = decltype(Policy::template GetWarpGemm1<Problem>());
+        auto warp_gemm_0 = WarpGemm0{};
+        auto warp_gemm_1 = WarpGemm1{};
+
+        // issues_warps_lanes
+        auto a_sst_win0 =
+            make_tile_window(make_tensor_view<address_space_enum::lds>(
+                                 smem_0, Policy::template MakeLdsStoreDesc_A<Problem>()),
+                             Policy::template MakeLdsStoreDesc_A<Problem>().get_lengths(),
+                             {0, 0, 0});
+
+        auto a_sst_win1 =
+            make_tile_window(make_tensor_view<address_space_enum::lds>(
+                                 smem_1, Policy::template MakeLdsStoreDesc_A<Problem>()),
+                             Policy::template MakeLdsStoreDesc_A<Problem>().get_lengths(),
+                             {0, 0, 0});
+        // m*k
+        auto a_sld_win0 = [&]() {
+            using WG                        = WarpGemm0;
+            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<BlockShape::Repeat_M0, BlockShape::WarpPerBlock_M0>,
+                      sequence<BlockShape::Repeat_K0>>,
+                tuple<sequence<1>>,
+                tuple<sequence<1>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                a_outer_dstr_enc, typename WG::AWarpDstrEncoding{});
+            return make_tile_window_linear(
+                make_tensor_view<address_space_enum::lds>(
+                    smem_0, Policy::template MakeLdsLoadDesc_A<Problem>()),
+                Policy::template MakeLdsLoadDesc_A<Problem>().get_lengths(),
+                {0, 0},
+                make_static_tile_distribution(a_block_dstr_encode));
+        }();
+
+        // m*k
+        auto a_sld_win1 = [&]() {
+            using WG                        = WarpGemm0;
+            constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<BlockShape::Repeat_M0, BlockShape::WarpPerBlock_M0>,
+                      sequence<BlockShape::Repeat_K0>>,
+                tuple<sequence<1>>,
+                tuple<sequence<1>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                a_outer_dstr_enc, typename WG::AWarpDstrEncoding{});
+            return make_tile_window_linear(
+                make_tensor_view<address_space_enum::lds>(
+                    smem_1, Policy::template MakeLdsLoadDesc_A<Problem>()),
+                Policy::template MakeLdsLoadDesc_A<Problem>().get_lengths(),
+                {0, 0},
+                make_static_tile_distribution(a_block_dstr_encode));
+        }();
+
+        auto bridge_sst_win = [&]() {
+            return make_tile_window(
+                make_tensor_view<address_space_enum::lds>(
+                    reinterpret_cast<YDataType*>(smem),
+                    Policy::template MakeBridgeLdsStoreDesc<Problem>()),
+                Policy::template MakeBridgeLdsStoreDesc<Problem>().get_lengths(),
+                {0, 0});
+        }();
+
+        auto bridge_sld_win = [&]() {
+            return make_tile_window_linear(
+                make_tensor_view<address_space_enum::lds>(
+                    reinterpret_cast<YDataType*>(smem),
+                    Policy::template MakeBridgeLdsLoadDesc<Problem>()),
+                Policy::template MakeBridgeLdsLoadDesc<Problem>().get_lengths(),
+                {0, 0},
+                Policy::template MakeYTileDistribution<Problem>());
+        }();
+
+        // also OK with C array, 2 register buffer
+        statically_indexed_array<g_thread_type, 2> gs;
+
+        constexpr auto issues_a = number<a_win.get_num_of_access()>{};
+        constexpr auto issues_g = number<g_win.get_num_of_access()>{};
+        // constexpr auto issues_d = number<d_win.get_num_of_access()>{};
+        // constexpr auto issues_o = number<o_win.get_num_of_access()>{};
+        constexpr auto issues_gemm0 =
+            number<BlockShape::Repeat_M0 * BlockShape::Repeat_N0 * BlockShape::Repeat_K0 *
+                   warp_gemm_0.get_num_of_access()>{};
+        constexpr auto issues_gemm1 =
+            number<BlockShape::Repeat_M1 * BlockShape::Repeat_N1 * BlockShape::Repeat_K1 *
+                   warp_gemm_1.get_num_of_access()>{};
+        // constexpr auto issues_sld_a = number<a_sld_win0.get_num_of_access()>{};
+
+        const index_t num_blocks_k0 =
+            (hidden_size + BlockShape::Block_K0 - 1) / BlockShape::Block_K0;
+        const index_t num_blocks_n1 =
+            (hidden_size + BlockShape::Block_N1 - 1) / BlockShape::Block_N1;
+
+        using a_thread_type = decltype(load_tile(a_sld_win0));
+        statically_indexed_array<a_thread_type, 2> as;
+
+        auto gld_a = [&]<typename PreNop = bool_constant<false>>(
+            auto& a_store_, auto i_access, PreNop = {})
+        {
+            async_load_tile_raw(a_store_, a_win, i_access, PreNop{});
+        };
+        auto move_a = [&]() {
+            move_tile_window(a_win, {number<0>{}, number<BlockShape::Block_K0>{}});
+        };
+        auto sld_a = [&](auto& a_, auto& win_, auto i_access) {
+            load_tile_raw(a_, win_, i_access);
+        };
+
+        auto gld_g = [&]<typename PreNop = bool_constant<false>>(
+            auto& g_, auto i_access, PreNop = {})
+        {
+            if constexpr(IsGateOnly)
+            {
+                // TODO: hack!
+                if constexpr(i_access.value == 0)
+                {
+                    g_win.bottom_tensor_view_ = g_view;
+                }
+                else if constexpr(i_access.value == issues_g / 2)
+                {
+                    g_win.bottom_tensor_view_ = u_view;
+                }
+            }
+            load_tile_raw(g_, g_win, i_access, FALSE, PreNop{});
+        };
+        auto move_g = [&]() {
+            move_tile_window(g_win, {number<0>{}, number<BlockShape::Block_Kr0>{}, number<0>{}});
+        };
+        statically_indexed_array<d_thread_type, 2> ds;
+
+        auto gld_d = [&]<typename PreNop = bool_constant<false>>(
+            auto& d_, auto i_access, PreNop = {})
+        {
+            load_tile_raw(d_, d_win, i_access, FALSE, PreNop{});
+        };
+        auto move_d = [&]() {
+            // d move along gemm-n
+            move_tile_window(d_win, {number<BlockShape::Block_N1>{}, number<0>{}});
+        };
+
+        auto atomic_add_o = [&]<typename PreNop = bool_constant<false>>(
+            auto& o_, auto i_access, PreNop = {})
+        {
+            update_tile_raw(o_win, o_, i_access, TRUE, PreNop{});
+        };
+
+        auto acc_0  = Policy::template MakeCBlockTile_Gemm0<Problem>();
+        auto acc_1s = generate_tuple(
+            [&](auto) { return Policy::template MakeCBlockTile_Gemm1<Problem>(); }, number<2>{});
+
+        // clang-format off
+        auto gemm_0 = [&]<typename PostNop = bool_constant<false>>
+        (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) {
+            using WarpGemm = remove_cvref_t<decltype(warp_gemm_0)>;
+
+            constexpr auto repeat_sub = WarpGemm::get_num_of_access();
+            constexpr auto repeat_m = BlockShape::Repeat_M0;
+            // constexpr auto repeat_n = BlockShape::Repeat_N0;
+            constexpr auto repeat_k = BlockShape::Repeat_K0;
+            // loop order n->m->k
+            constexpr auto i_sub = i_access % repeat_sub;
+            constexpr auto i_k = (i_access / repeat_sub) % repeat_k;
+            constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m;
+            constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m;
+
+            using AWarpTensor = typename WarpGemm::AWarpTensor;
+            using BWarpTensor = typename WarpGemm::BWarpTensor;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;
+            using AWarpDstr = typename WarpGemm::AWarpDstr;
+            using BWarpDstr = typename WarpGemm::BWarpDstr;
+            using CWarpDstr = typename WarpGemm::CWarpDstr;
+
+            constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+            constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+            constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+            AWarpTensor w_a;
+            w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data(
+                    merge_sequences(sequence<i_m, i_k>{}, a_warp_y_index_zeros),
+                    merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
+            BWarpTensor w_b;
+            w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data(
+                merge_sequences(sequence<i_n, i_k>{}, b_warp_y_index_zeros),
+                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+
+            CWarpTensor w_c;
+            w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data(
+                        merge_sequences(sequence<i_m, i_n>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+            warp_gemm_0(w_c, w_a, w_b, number<i_sub>{}, PostNop{});
+
+            t_c.set_y_sliced_thread_data(
+                        merge_sequences(sequence<i_m, i_n>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        w_c.get_thread_buffer());
+        };
+        // clang-format on
+
+        // clang-format off
+        auto gemm_1 = [&]<typename PostNop = bool_constant<false>>
+        (auto& t_c, auto& t_a, auto& t_b, auto i_access, PostNop = {}) {
+            using WarpGemm = remove_cvref_t<decltype(warp_gemm_1)>;
+
+            constexpr auto repeat_sub = WarpGemm::get_num_of_access();
+            constexpr auto repeat_m = BlockShape::Repeat_M0;
+            // constexpr auto repeat_n = BlockShape::Repeat_N0;
+            constexpr auto repeat_k = BlockShape::Repeat_K0;
+            // loop order n->m->k
+            constexpr auto i_sub = i_access % repeat_sub;
+            constexpr auto i_k = (i_access / repeat_sub) % repeat_k;
+            constexpr auto i_m = (i_access / (repeat_sub * repeat_k )) % repeat_m;
+            constexpr auto i_n = (i_access / (repeat_sub * repeat_k )) / repeat_m;
+
+            using AWarpTensor = typename WarpGemm::AWarpTensor;
+            using BWarpTensor = typename WarpGemm::BWarpTensor;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;
+            using AWarpDstr = typename WarpGemm::AWarpDstr;
+            using BWarpDstr = typename WarpGemm::BWarpDstr;
+            using CWarpDstr = typename WarpGemm::CWarpDstr;
+
+            constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+            constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+            constexpr auto a_warp_y_lengths = to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto b_warp_y_lengths = to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto c_warp_y_lengths = to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+            AWarpTensor w_a;
+            w_a.get_thread_buffer() = t_a.get_y_sliced_thread_data(
+                    merge_sequences(sequence<i_m, i_k>{}, a_warp_y_index_zeros),
+                    merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
+            BWarpTensor w_b;
+            w_b.get_thread_buffer() = t_b.get_y_sliced_thread_data(
+                merge_sequences(sequence<i_n, i_k>{}, b_warp_y_index_zeros),
+                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+
+            CWarpTensor w_c;
+            w_c.get_thread_buffer() = t_c.get_y_sliced_thread_data(
+                        merge_sequences(sequence<i_m, i_n>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+            warp_gemm_1(w_c, w_a, w_b, number<i_sub>{}, PostNop{});
+
+            t_c.set_y_sliced_thread_data(
+                        merge_sequences(sequence<i_m, i_n>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        w_c.get_thread_buffer());
+        };
+        // clang-format on
+        _Pragma("clang diagnostic pop");
+
+        // this gemm pipeline is designed with assumption that issues of buffer-load/ds_read can
+        // be hide under mfma. In other words, issues of mfma is >= memory this is true if we
+        // pre-shuffle B matrix, and A matrix is relatively small we prefer use multiple mfma
+        // paired with 1 buffer-load B matrix, to get max throughput of buffer_load. and by
+        // preshuffle, we always pack to dwordx4 load, and this will already extend to multiple
+        // mfma but that is already consumed inside warpgemm-impl. So indeed how many extra
+        // mfma(that can reuse the B matrix) only affected by M repeat.
+        auto pipeline_gemm0 = [&]() {
+            constexpr index_t total_loops = issues_gemm0;
+            constexpr auto sr             = Policy::template GetSequencer_0<Problem>();
+            static_assert(sr.size() == total_loops);
+
+            constexpr auto c_sld_a_0 = MAKE_SC();
+            constexpr auto c_gld_a_0 = MAKE_SC();
+            constexpr auto c_gld_b_0 = MAKE_SC();
+            // compute buffer 1
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_0(acc_0, as[I0], gs[I0], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+
+                if constexpr(slot & SLD_A)
+                    sld_a(as[I1], a_sld_win1, number<NEXT_SCI(c_sld_a_0, i_issue)>{});
+                if constexpr(slot & GLD_A)
+                    gld_a(a_sst_win0, number<NEXT_SCI(c_gld_a_0, i_issue)>{});
+                if constexpr(slot & GLD_B)
+                    gld_g(gs[I0], number<NEXT_SCI(c_gld_b_0, i_issue)>{});
+            });
+            move_g();
+            move_a();
+            block_sync_load_raw(issues_a + issues_g);
+            lds_load_fence();
+
+            constexpr auto c_sld_a_1 = MAKE_SC();
+            constexpr auto c_gld_a_1 = MAKE_SC();
+            constexpr auto c_gld_b_1 = MAKE_SC();
+
+            // compute buffer 1
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_0(acc_0, as[I1], gs[I1], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+
+                if constexpr(slot & SLD_A)
+                    sld_a(as[I0], a_sld_win0, number<NEXT_SCI(c_sld_a_1, i_issue)>{});
+                if constexpr(slot & GLD_A)
+                    gld_a(a_sst_win1, number<NEXT_SCI(c_gld_a_1, i_issue)>{});
+                if constexpr(slot & GLD_B)
+                    gld_g(gs[I1], number<NEXT_SCI(c_gld_b_1, i_issue)>{});
+            });
+            move_g();
+            move_a();
+            block_sync_load_raw(issues_a + issues_g);
+            lds_load_fence();
+        };
+
+        auto pipeline_gemm0_tail = [&]() {
+            constexpr index_t total_loops = issues_gemm0;
+            constexpr auto sr             = Policy::template GetSequencer_0<Problem>();
+            static_assert(sr.size() == total_loops);
+
+            constexpr auto c_gld_b_0 = MAKE_SC();
+
+            // compute buffer 0
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_0(acc_0, as[I0], gs[I0], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+
+                if constexpr(slot & GLD_B)
+                    gld_g(gs[I1], number<NEXT_SCI(c_gld_b_0, i_issue)>{});
+            });
+
+            block_sync_load_raw(issues_g);
+            sld_a(as[I1], a_sld_win1, NEG1);
+
+            // compute buffer 1
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                constexpr auto last_nop = [&]() {
+                    if constexpr(i_issue == (total_loops - 1))
+                        return TRUE;
+                    else
+                        return FALSE;
+                }();
+                gemm_0(acc_0, as[I1], gs[I1], i_issue, last_nop); // last gemm has nop
+            });
+        };
+
+        auto y = Policy::template MakeYBlockTile<Problem>();
+
+        auto pipeline_bridge = [&]() {
+            // cast to Y data
+            auto y_pre = cast_tile<YDataType>(acc_0);
+            store_tile(bridge_sst_win, y_pre);
+            clear_tile(acc_1s(I0));
+            // wave_barrier();
+            load_tile(y, bridge_sld_win);
+            clear_tile(acc_1s(I1));
+        };
+
+        // note, gemm-1 start from idx-1 to N-2 (0, 1, 2....N-1)
+        auto pipeline_gemm1 = [&]() {
+            constexpr index_t total_loops = issues_gemm1;
+            constexpr auto sr             = Policy::template GetSequencer_1<Problem>();
+            static_assert(sr.size() == total_loops);
+
+            constexpr auto c_gld_b_0 = MAKE_SC();
+            constexpr auto c_gst_o_0 = MAKE_SC();
+            constexpr auto c_gld_b_1 = MAKE_SC();
+            constexpr auto c_gst_o_1 = MAKE_SC();
+
+            // compute buffer 0
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_1(acc_1s[I1], y, ds[I1], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+                if constexpr(slot & GLD_B)
+                    gld_d(ds[I0], number<NEXT_SCI(c_gld_b_0, i_issue)>{});
+
+                if constexpr(slot & GST_O)
+                {
+                    auto out = cast_tile<ODataType>(acc_1s[I0]);
+                    atomic_add_o(out, number<NEXT_SCI(c_gst_o_0, i_issue)>{});
+                }
+            });
+            move_d();
+            // move_o();
+
+            // compute buffer 1
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_1(acc_1s[I0], y, ds[I0], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+                if constexpr(slot & GLD_B)
+                    gld_d(ds[I1], number<NEXT_SCI(c_gld_b_1, i_issue)>{});
+
+                if constexpr(slot & GST_O)
+                {
+                    auto out = cast_tile<ODataType>(acc_1s[I1]);
+                    atomic_add_o(out, number<NEXT_SCI(c_gst_o_1, i_issue)>{});
+                }
+            });
+            move_d();
+        };
+
+        auto pipeline_gemm1_head = [&]() {
+            constexpr index_t total_loops = issues_gemm1;
+            constexpr auto sr             = Policy::template GetSequencer_1<Problem>();
+            static_assert(sr.size() == total_loops);
+
+            constexpr auto c_gld_b_0 = MAKE_SC();
+
+            // compute buffer 0
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_1(acc_1s[I0], y, ds[I0], i_issue);
+                constexpr index_t slot = sr.at(i_issue);
+                if constexpr(slot & GLD_B)
+                    gld_d(ds[I1], number<NEXT_SCI(c_gld_b_0, i_issue)>{});
+            });
+            move_d();
+        };
+        auto pipeline_gemm1_tail = [&]() {
+            constexpr index_t total_loops = issues_gemm1;
+            constexpr auto sr             = Policy::template GetSequencer_1<Problem>();
+            static_assert(sr.size() == total_loops);
+
+            constexpr auto c_gst_o_0 = MAKE_SC();
+
+            // compute buffer 1
+            static_for<0, total_loops, 1>{}([&](auto i_issue) {
+                gemm_1(acc_1s[I1], y, ds[I1], i_issue);
+
+                constexpr index_t slot = sr.at(i_issue);
+                if constexpr(slot & GST_O)
+                {
+                    auto out = cast_tile<ODataType>(acc_1s[I0]);
+                    atomic_add_o(out, number<NEXT_SCI(c_gst_o_0, i_issue)>{});
+                }
+            });
+            {
+                auto out = cast_tile<ODataType>(acc_1s[I1]);
+                atomic_add_o(out, NEG1);
+            }
+        };
+
+        // start of pipeline
+        // clang-format off
+        gld_a(a_sst_win0, NEG1, TRUE);
+        gld_g(gs[I0], NEG1, TRUE);
+        move_a();
+        move_g();
+        clear_tile(acc_0);
+
+        // preload for next round
+        gld_a(a_sst_win1, NEG1); 
+        gld_g(gs[I1], NEG1);
+
+        // make sure a,g loaded
+        block_sync_load_raw(issues_a + issues_g);
+        lds_load_fence();
+
+        // we manually unroll double buffer inside hot loop
+        const index_t iters_0 = (num_blocks_k0 - 2) / 2;
+        index_t i_0 = 0; // (void)i_0; (void)iters_0; (void)pipeline_gemm0;
+        while(i_0++ < iters_0)
+        {
+            pipeline_gemm0();
+        }
+        pipeline_gemm0_tail();
+
+        pipeline_bridge();
+
+        const index_t iters_1 = (num_blocks_n1 - 2) / 2;
+        index_t i_1 = 0; // (void) i_1; (void)iters_1; (void)pipeline_gemm1;
+        pipeline_gemm1_head();
+        while(i_1++ < iters_1)
+        {
+            pipeline_gemm1();
+        }
+        pipeline_gemm1_tail();
+        // clang-format on
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
new file mode 100644
index 000000000..fea30f029
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -0,0 +1,831 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
+#include "ck_tile/ops/flatmm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+
+namespace ck_tile {
+
+struct FusedMoeGemmPipelineFlatmmPolicy
+{
+    CK_TILE_HOST_DEVICE static constexpr index_t GetAsyncCopyDwords()
+    {
+        // TODO: always 1 dword
+        return 1;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_A()
+    {
+        // using async
+        constexpr index_t copy_bytes = 4 * GetAsyncCopyDwords();
+        constexpr index_t data_bytes = sizeof(typename Problem::ADataType);
+        static_assert(copy_bytes % data_bytes == 0);
+        return copy_bytes / data_bytes;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_G()
+    {
+        constexpr index_t copy_bytes = [&]() { return 16; }();
+        constexpr index_t data_bytes = sizeof(typename Problem::GDataType);
+        static_assert(copy_bytes % data_bytes == 0);
+        return copy_bytes / data_bytes;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_D()
+    {
+        constexpr index_t copy_bytes = [&]() { return 16; }();
+        constexpr index_t data_bytes = sizeof(typename Problem::DDataType);
+        static_assert(copy_bytes % data_bytes == 0);
+        return copy_bytes / data_bytes;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignment_O()
+    {
+        if constexpr(Problem::Traits::OAtomic == 1)
+        {
+            // pack fp16/bf16 atomic
+            static_assert(sizeof(typename Problem::ODataType) == 2);
+            return 2;
+        }
+        else if constexpr(Problem::Traits::OAtomic == 2)
+        {
+            // fp32 atomic
+            return 1;
+        }
+        else
+        {
+            return 16 / sizeof(typename Problem::ODataType);
+        }
+    }
+
+    template <typename DataType_>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack()
+    {
+        // TODO: this is for 3d layout
+        return 16 / sizeof(remove_cvref_t<DataType_>);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_A()
+    {
+        return GetSmemKPack<typename Problem::ADataType>();
+    }
+
+    // used for bridge LDS shuffle
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPack_Y()
+    {
+        // TODO: this should match mfma layout
+        return 16 / sizeof(typename Problem::YDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_A()
+    {
+        constexpr auto a_sld_desc = MakeLdsLoadDesc_A<Problem>();
+        constexpr auto a_sst_desc = MakeLdsStoreDesc_A<Problem>();
+        static_assert(a_sld_desc.get_element_space_size() == a_sst_desc.get_element_space_size());
+        return a_sld_desc.get_element_space_size();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize_Bridge()
+    {
+        constexpr auto bridge_sld_desc = MakeBridgeLdsLoadDesc<Problem>();
+        constexpr auto bridge_sst_desc = MakeBridgeLdsStoreDesc<Problem>();
+        static_assert(bridge_sld_desc.get_element_space_size() ==
+                      bridge_sst_desc.get_element_space_size());
+        return bridge_sld_desc.get_element_space_size();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        constexpr index_t a_lds      = GetSmemSize_A<Problem>();
+        constexpr index_t bridge_lds = GetSmemSize_Bridge<Problem>();
+        return max(a_lds, bridge_lds);
+    }
+
+    template <index_t MPerBlock, index_t KPerBlock, index_t NumWarps, index_t Alignment>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_SimpleMxK()
+    {
+        constexpr index_t K_vec = Alignment;
+        constexpr index_t K_rem = KPerBlock / K_vec;
+
+        if constexpr(get_warp_size() < K_rem)
+        {
+            static_assert(K_rem % get_warp_size() == 0);
+            constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k
+            constexpr index_t K_wav = K_rem / get_warp_size();
+            static_assert(K_wav <= NumWarps, "not not support thread has repeat along K yet");
+            constexpr index_t M_wav = NumWarps / K_wav;
+            static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check");
+            constexpr index_t M_rep = MPerBlock / M_wav;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,
+                    tuple<sequence<M_rep, M_wav>, sequence<K_wav, K_lan, K_vec>>,
+                    tuple<sequence<1, 2>, sequence<2>>,
+                    tuple<sequence<1, 0>, sequence<1>>,
+                    sequence<1, 2>,
+                    sequence<0, 2>>{});
+        }
+        else
+        {
+            constexpr index_t K_lan = K_rem;
+            constexpr index_t M_lan = get_warp_size() / K_lan;
+            constexpr index_t M_wav = NumWarps;
+            static_assert(MPerBlock % (M_lan * M_wav) == 0,
+                          "this tile size is too small please check");
+            constexpr index_t M_rep = MPerBlock / (M_lan * M_wav);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,
+                    tuple<sequence<M_rep, M_wav, M_lan>, sequence<K_lan, K_vec>>,
+                    tuple<sequence<1>, sequence<1, 2>>,
+                    tuple<sequence<1>, sequence<2, 0>>,
+                    sequence<1, 2>,
+                    sequence<0, 1>>{});
+        }
+    }
+
+    // optimized version for async, not same as simple MXK dist(pay attention!!)
+    template <index_t MPerBlock, index_t KPerBlock, index_t NumWarps, index_t Alignment>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_SimpleMxK_Async()
+    {
+        constexpr index_t K_vec = Alignment;
+        constexpr index_t K_rem = KPerBlock / K_vec;
+
+        if constexpr(get_warp_size() <= K_rem)
+        {
+            static_assert(K_rem % get_warp_size() == 0);
+            constexpr index_t K_lan = get_warp_size(); // lane within same wave is along gemm-k
+            constexpr index_t K_wav = K_rem / get_warp_size();
+            static_assert(K_wav <= NumWarps, "do not support thread has repeat along K yet");
+            constexpr index_t M_wav = NumWarps / K_wav;
+            static_assert(MPerBlock % M_wav == 0, "this tile size is too small please check");
+            constexpr index_t M_rep = MPerBlock / M_wav;
+            // NOTE: no swap, but hard to avoid LDS bank conflict
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,
+                    tuple<sequence<M_rep, M_wav>, sequence<K_wav, K_lan, K_vec>>,
+                    tuple<sequence<1, 2>, sequence<2>>,
+                    tuple<sequence<1, 0>, sequence<1>>,
+                    sequence<1, 2>,
+                    sequence<0, 2>>{});
+        }
+        else
+        {
+            constexpr index_t K_lan = K_rem;
+            constexpr index_t M_lan = get_warp_size() / K_lan;
+            constexpr index_t M_wav = NumWarps;
+            static_assert(MPerBlock % (M_lan * M_wav) == 0,
+                          "this tile size is too small please check");
+            constexpr index_t M_rep = MPerBlock / (M_lan * M_wav);
+            // NOTE: swapped for LDS load bank conflict free
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,
+                    // Note M_wave(num waves) is the fastest dim, different from sipmle 2d
+                    // distribution
+                    tuple<sequence<M_rep, M_lan, M_wav>, sequence<K_lan, K_vec>>,
+                    tuple<sequence<1>, sequence<1, 2>>,
+                    tuple<sequence<2>, sequence<1, 0>>,
+                    sequence<1, 2>,
+                    sequence<0, 1>>{});
+        }
+    }
+
+    template <index_t WarpPerBlock_N_,
+              index_t WarpPerBlock_K_,
+              index_t Repeat_N_,
+              index_t Repeat_K_,
+              index_t WarpSize_,
+              index_t Alignment_>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_Nr_Kr_W()
+    {
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<Repeat_N_, WarpPerBlock_N_>,
+                                             sequence<Repeat_K_, WarpPerBlock_K_>,
+                                             sequence<WarpSize_, Alignment_>>,
+                                       tuple<sequence<1, 2>, sequence<3>>,
+                                       tuple<sequence<1, 1>, sequence<0>>,
+                                       sequence<1, 2, 3>,
+                                       sequence<0, 0, 1>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_A()
+    {
+        constexpr index_t Block_M_   = Problem::BlockShape::Block_M0;
+        constexpr index_t Block_K_   = Problem::BlockShape::Block_K0;
+        constexpr index_t NumWarps_  = Problem::BlockShape::NumWarps;
+        constexpr index_t Alignment_ = GetAlignment_A<Problem>();
+        return MakeGlobalTileDistribution_SimpleMxK_Async<Block_M_,
+                                                          Block_K_,
+                                                          NumWarps_,
+                                                          Alignment_>();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_G()
+    {
+        constexpr auto PermuteEnum = Problem::Traits::PermuteEnum;
+        // constexpr index_t hidden_radio_0 = Problem::Traits::IsGateOnly ? 1 : 2;
+        using S_ = typename Problem::BlockShape;
+        if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten)
+        {
+            // number<S_::WarpPerBlock_N0>{}.rrr();
+            // number<S_::Repeat_N0>{}.eee();
+            return MakeGlobalTileDistribution_Nr_Kr_W<S_::WarpPerBlock_N0,
+                                                      S_::WarpPerBlock_K0,
+                                                      S_::Repeat_N0, /// hidden_radio_0,
+                                                      S_::Repeat_K0,
+                                                      get_warp_size(),
+                                                      GetAlignment_G<Problem>()>();
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_D()
+    {
+        constexpr auto PermuteEnum = Problem::Traits::PermuteEnum;
+        using S_                   = typename Problem::BlockShape;
+        if constexpr(PermuteEnum == FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten)
+        {
+            return MakeGlobalTileDistribution_Nr_Kr_W<S_::WarpPerBlock_N1,
+                                                      S_::WarpPerBlock_K1,
+                                                      S_::Repeat_N1,
+                                                      S_::Repeat_K1,
+                                                      get_warp_size(),
+                                                      GetAlignment_D<Problem>()>();
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeGlobalTileDistribution_O()
+    {
+        using S_       = remove_cvref_t<typename Problem::BlockShape>;
+        using WarpGemm = remove_cvref_t<decltype(GetWarpGemm1<Problem>())>;
+        // using CDataType = typename WarpGemm::CDataType;
+
+        constexpr auto c_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<S_::Repeat_M1, S_::WarpPerBlock_M1>,
+                                             sequence<S_::Repeat_N1, S_::WarpPerBlock_N1>>,
+                                       tuple<sequence<1, 2>>,
+                                       tuple<sequence<1, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A()
+    {
+        // A async->LDS
+        constexpr index_t Block_M = Problem::BlockShape::Block_M0;
+        constexpr index_t Block_K = Problem::BlockShape::Block_K0;
+        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
+        constexpr index_t warpSize = ck_tile::get_warp_size();
+        constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
+
+        constexpr index_t KPack   = GetSmemKPack_A<Problem>(); // LDS
+        constexpr index_t KVector = GetAlignment_A<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = KPack;                     // pad between warps
+
+        static_assert(Block_K % KVector == 0);
+        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
+        if constexpr(LanesPerK >= warpSize)
+        {
+            // need multiple waves to load K
+            static_assert(LanesPerK % warpSize == 0);
+            constexpr index_t wavesPerK = LanesPerK / warpSize;
+            if constexpr(wavesPerK > NumWarps)
+            {
+                // TODO: need multiple issues along K to load all data
+            }
+            else
+            {
+                constexpr index_t wavesPerM     = NumWarps / wavesPerK;
+                constexpr index_t NumIssues     = Block_M / wavesPerM;
+                constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<NumIssues>{},                             // m0
+                               number<wavesPerM>{},                             // m1
+                               number<wavesPerK>{},                             // k0
+                               number<warpSize>{},                              // k1
+                               number<KVector>{}),                              // k2
+                    make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{},  // m0
+                               number<wavesPerK*(warpSize * KVector + KPad)>{}, // m1
+                               number<warpSize * KVector + KPad>{},             // k0
+                               number<KVector>{},                               // k1
+                               number<1>{}),                                    // k2
+                    number<KVector>{}, // lds store vector(actually no explicit store)
+                    number<1>{});
+
+                constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
+                    lds_block_desc_0,
+                    make_tuple(
+                        make_pass_through_transform(number<NumIssues>{}),
+                        make_merge_transform(make_tuple(number<wavesPerM>{}, number<wavesPerK>{})),
+                        make_merge_transform(make_tuple(number<warpSize>{}, number<KVector>{}))),
+                    make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+                return lds_block_desc_issues_warps_lanes;
+            }
+        }
+        else
+        {
+            // lanes within a wave load different M but same K
+            static_assert(warpSize % LanesPerK == 0);
+            constexpr index_t LaneGroups = warpSize / LanesPerK; // along m
+            constexpr index_t NumIssues  = Block_M / (LaneGroups * NumWarps);
+
+            constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(number<NumIssues>{},                            // m0
+                           number<LaneGroups>{},                           // m1
+                           number<NumWarps>{},                             // m2
+                           number<LanesPerK>{},                            // k0
+                           number<KVector>{}),                             // k1
+                make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{}, // m0
+                           number<Block_K>{},                              // m1
+                           number<warpSize * KVector + KPad>{},            // m2
+                           number<KVector>{},                              // k0
+                           number<1>{}),                                   // k1
+                number<KVector>{}, // lds store vector(actually no explicit store)
+                number<1>{});
+
+            constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
+                lds_block_desc_0,
+                make_tuple(make_pass_through_transform(number<NumIssues>{}),
+                           make_pass_through_transform(number<NumWarps>{}),
+                           make_merge_transform(make_tuple(
+                               number<LaneGroups>{}, number<LanesPerK>{}, number<KVector>{}))),
+                make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+            return lds_block_desc_issues_warps_lanes;
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
+    {
+        // A async->LDS
+        // Note that, this descriptor is only to construct the layout inside LDS
+        // in real Gemm pipeline, ds_read may not follow this pattern
+        // (may follow that in tile_distribution)
+        // below code is almost the same as SmemStore dist, with difference:
+        //  1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
+        //  2). return discriptor is in NxK 2d layout
+        constexpr index_t Block_M = Problem::BlockShape::Block_M0;
+        constexpr index_t Block_K = Problem::BlockShape::Block_K0;
+        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
+        constexpr index_t warpSize = ck_tile::get_warp_size();
+        constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
+
+        constexpr index_t KPack   = GetSmemKPack_A<Problem>(); // LDS
+        constexpr index_t KVector = GetAlignment_A<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = KPack;                     // pad between warps
+
+        static_assert(Block_K % KVector == 0);
+        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
+        if constexpr(LanesPerK >= warpSize)
+        {
+            // need multiple waves to load K
+            static_assert(LanesPerK % warpSize == 0);
+            constexpr index_t wavesPerK = LanesPerK / warpSize;
+            if constexpr(wavesPerK >= NumWarps)
+            {
+                // TODO: need multiple issues along K to load all data
+            }
+            else
+            {
+                constexpr index_t wavesPerM     = NumWarps / wavesPerK;
+                constexpr index_t NumIssues     = Block_M / wavesPerM;
+                constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                    make_tuple(number<NumIssues>{},                             // m0
+                               number<wavesPerM>{},                             // m1
+                               number<wavesPerK>{},                             // k0
+                               number<warpSize>{},                              // k1
+                               number<KVector>{}),                              // k2
+                    make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{},  // m0
+                               number<wavesPerK*(warpSize * KVector + KPad)>{}, // m1
+                               number<warpSize * KVector + KPad>{},             // k0
+                               number<KVector>{},                               // k1
+                               number<1>{}),                                    // k2
+                    number<KPack>{},                                            // lds load vector
+                    number<1>{});
+
+                constexpr auto lds_desc_m_k = transform_tensor_descriptor(
+                    lds_block_desc_0,
+                    make_tuple(
+                        make_merge_transform(make_tuple(number<NumIssues>{}, number<wavesPerM>{})),
+                        make_merge_transform(make_tuple(
+                            number<wavesPerK>{}, number<warpSize>{}, number<KVector>{}))),
+                    make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
+                    make_tuple(sequence<0>{}, sequence<1>{}));
+
+                return lds_desc_m_k;
+            }
+        }
+        else
+        {
+            // lanes within a wave load different M but same K
+            static_assert(warpSize % LanesPerK == 0);
+            constexpr index_t LaneGroups = warpSize / LanesPerK; // along m
+            constexpr index_t NumIssues  = Block_M / (LaneGroups * NumWarps);
+
+            constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(number<NumIssues>{},                            // m0
+                           number<LaneGroups>{},                           // m1
+                           number<NumWarps>{},                             // m2
+                           number<LanesPerK>{},                            // k0
+                           number<KVector>{}),                             // k1
+                make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{}, // m0
+                           number<Block_K>{},                              // m1
+                           number<warpSize * KVector + KPad>{},            // m2
+                           number<KVector>{},                              // k0
+                           number<1>{}),                                   // k1
+                number<KPack>{},                                           // lds load vector
+                number<1>{});
+
+            constexpr auto lds_desc_m_k = transform_tensor_descriptor(
+                lds_block_desc_0,
+                make_tuple(
+                    make_merge_transform(
+                        make_tuple(number<NumIssues>{}, number<LaneGroups>{}, number<NumWarps>{})),
+                    make_merge_transform(make_tuple(number<LanesPerK>{}, number<KVector>{}))),
+                make_tuple(sequence<0, 1, 2>{}, sequence<3, 4>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            return lds_desc_m_k;
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsLoadDesc()
+    {
+        constexpr index_t Block_M = Problem::BlockShape::Block_M0;
+        constexpr index_t Block_N = Problem::BlockShape::Block_N0;
+
+        constexpr index_t KVector = GetSmemKPack_Y<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = 0;                         // pad between warps
+
+        constexpr auto desc =
+            make_naive_tensor_descriptor(make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                         make_tuple(number<Block_N + KPad>{}, number<1>{}),
+                                         number<KVector>{},
+                                         number<1>{});
+        return desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreDesc()
+    {
+        constexpr index_t Block_M = Problem::BlockShape::Block_M0;
+        constexpr index_t Block_N = Problem::BlockShape::Block_N0;
+
+        constexpr index_t KVector = GetSmemKPack_Y<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = 0; // KVector;                   // pad between warps
+
+        constexpr auto desc =
+            make_naive_tensor_descriptor(make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                         make_tuple(number<Block_N + KPad>{}, number<1>{}),
+                                         number<KVector>{},
+                                         number<1>{});
+        return desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc()
+    {
+        constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0;
+        constexpr index_t Repeat_N       = Problem::BlockShape::Repeat_N0;
+        constexpr index_t Repeat_M       = Problem::BlockShape::Repeat_M0;
+
+        constexpr index_t kAMLane     = 16;
+        constexpr index_t kABKLane    = 4;
+        constexpr index_t kABKPerLane = 4;
+
+        constexpr index_t KPack = kABKPerLane;
+
+        constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<Repeat_M>{},                                               // m
+                       number<Repeat_N>{},                                               // n
+                       number<WarpPerBlock_N>{},                                         // n
+                       number<kABKLane>{},                                               // n
+                       number<kAMLane>{},                                                // m
+                       number<KPack>{}),                                                 // n
+            make_tuple(number<Repeat_N * WarpPerBlock_N * kABKLane * kAMLane * KPack>{}, //  m
+                       number<WarpPerBlock_N * kABKLane * kAMLane * KPack>{},            //  n
+                       number<kABKLane * kAMLane * KPack>{},                             //  n
+                       number<kAMLane * KPack>{},                                        //  n
+                       number<KPack>{},                                                  //  m
+                       number<1>{}),                                                     //  n
+            number<KPack>{}, // lds store vector(actually no explicit store)
+            number<1>{});
+
+        constexpr auto desc = transform_tensor_descriptor(
+            lds_block_desc_0,
+            make_tuple(make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
+                       make_merge_transform(make_tuple(number<Repeat_N>{},
+                                                       number<WarpPerBlock_N>{},
+                                                       number<kABKLane>{},
+                                                       number<KPack>{}))),
+            make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0()
+    {
+        using S_ = typename Problem::BlockShape;
+        // A is vgpr, B is agpr. But since we transposed, so also need swap this
+        // TODO: this is ugly
+        constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv;
+        // TODO: ugly
+        if constexpr(std::is_same_v<typename Problem::ADataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::GDataType, ck_tile::bf16_t> &&
+                     S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16)
+        {
+            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<wg_ctrl>,
+                2>>{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::ADataType, ck_tile::int8_t> &&
+                          std::is_same_v<typename Problem::GDataType, ck_tile::int8_t> &&
+                          S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32)
+        {
+            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<wg_ctrl>,
+                2>>{};
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_0()
+    {
+        // this function return seq<...> used to identify gld/sld/valu... inside mfma sequence
+        // the purpose is to hide thoes instructions under mfma
+        // every value inside seq<...> is a mask, indicating a specific operation
+        using S_                = typename Problem::BlockShape;
+        constexpr index_t SLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::SLD_A);
+        constexpr index_t GLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_A);
+        constexpr index_t GLD_B = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_B);
+        if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                     S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 &&
+                     S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 &&
+                     S_::Block_N1 == 128)
+        {
+            // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
+            // gld_a 8x ds_read_b128 sld_a total 64 slot :)
+            // clang-format off
+            constexpr auto seq_all =
+                    //       0       1       2        3       4      5        6       7
+                   sequence<GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,    // 0
+                            GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,    // 1
+                            GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,    // 2
+                            GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,    // 3
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 4
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 5
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 6
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0>{}; // 7
+            return seq_all;
+            // clang-format on
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                          S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16 &&
+                          S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 &&
+                          S_::Block_N1 == 128)
+        {
+            // Total 32 instructions, 16 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
+            // gld_a 8x ds_read_b128 sld_a total 64 slot :)
+            // clang-format off
+            constexpr auto seq_all =
+                    //       0       1       2        3       4      5        6       7
+                   sequence<GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,    // 0
+                            GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,  GLD_B,  GLD_A,    // 1
+                            GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,    // 2
+                            GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A,  GLD_B,  SLD_A>{};    // 3
+            return seq_all;
+            // clang-format on
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSequencer_1()
+    {
+        // this function return seq<...> used to identify gld/sld/valu... inside mfma sequence
+        // the purpose is to hide thoes instructions under mfma
+        // every value inside seq<...> is a mask, indicating a specific operation
+        using S_                = typename Problem::BlockShape;
+        constexpr index_t GLD_B = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_B);
+        constexpr index_t GST_O = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GST_O);
+        if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                     S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 &&
+                     S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 &&
+                     S_::Block_N1 == 128)
+        {
+            // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
+            // gld_a 8x ds_read_b128 sld_a total 64 slot :)
+            // clang-format off
+            constexpr auto seq_all =
+                    //       0       1       2        3       4      5        6       7
+                   sequence<GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,    // 0
+                            GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,    // 1
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 2
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 3
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 4
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 5
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 6
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0>{}; // 7
+            return seq_all;
+            // clang-format on
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                          S_::Warp_M1 == 32 && S_::Warp_N1 == 32 && S_::Warp_K1 == 16 &&
+                          S_::Block_M0 == 32 && S_::Block_N0 == 256 && S_::Block_K0 == 128 &&
+                          S_::Block_N1 == 128)
+        {
+            // Total 64 instructions, 32 buffer-load-dwordx4 gld_b, 8x buffer-load-dwordx1-async
+            // gld_a 8x ds_read_b128 sld_a total 64 slot :)
+            // clang-format off
+            constexpr auto seq_all =
+                    //       0       1       2        3       4      5        6       7
+                   sequence<GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,    // 0
+                            GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,  GLD_B,  GST_O,    // 1
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0,    // 2
+                            GLD_B,      0,  GLD_B,      0,  GLD_B,      0,  GLD_B,      0>{};    // 3
+            return seq_all;
+            // clang-format on
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm1()
+    {
+        using S_               = typename Problem::BlockShape;
+        constexpr auto wg_ctrl = WGAttrCtlEnum::Raw_avv;
+        // TODO: ugly
+        if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                     S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16)
+        {
+            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<wg_ctrl>,
+                2>>{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::int8_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::int8_t> &&
+                          S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32)
+        {
+            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<wg_ctrl>,
+                2>>{};
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm0()
+    {
+        using S_        = remove_cvref_t<typename Problem::BlockShape>;
+        using WarpGemm  = remove_cvref_t<decltype(GetWarpGemm0<Problem>())>;
+        using CDataType = typename WarpGemm::CDataType;
+
+        constexpr auto c_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<S_::Repeat_M0, S_::WarpPerBlock_M0>,
+                                             sequence<S_::Repeat_N0, S_::WarpPerBlock_N0>>,
+                                       tuple<sequence<1, 2>>,
+                                       tuple<sequence<1, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeCBlockTile_Gemm1()
+    {
+        using S_        = remove_cvref_t<typename Problem::BlockShape>;
+        using WarpGemm  = remove_cvref_t<decltype(GetWarpGemm1<Problem>())>;
+        using CDataType = typename WarpGemm::CDataType;
+
+        constexpr auto c_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<>,
+                                       tuple<sequence<S_::Repeat_M1, S_::WarpPerBlock_M1>,
+                                             sequence<S_::Repeat_N1, S_::WarpPerBlock_N1>>,
+                                       tuple<sequence<1, 2>>,
+                                       tuple<sequence<1, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    // this is used as A matrix for 2nd gemm
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeYTileDistribution()
+    {
+        using S_       = remove_cvref_t<typename Problem::BlockShape>;
+        using WarpGemm = remove_cvref_t<decltype(GetWarpGemm1<Problem>())>;
+
+        // TODO: all waves a along different N, but same M
+        constexpr auto y_outer_dstr_enc =
+            tile_distribution_encoding<sequence<S_::WarpPerBlock_M1>,
+                                       tuple<sequence<S_::Repeat_M1>, sequence<S_::Repeat_K1>>,
+                                       tuple<sequence<0>>,
+                                       tuple<sequence<0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto y_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            y_outer_dstr_enc, typename WarpGemm::AWarpDstrEncoding{});
+        constexpr auto y_block_dstr = make_static_tile_distribution(y_block_dstr_encode);
+        return y_block_dstr;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeYBlockTile()
+    {
+        constexpr auto y_block_dstr = MakeYTileDistribution<Problem>();
+        auto y_block_tensor =
+            make_static_distributed_tensor<typename Problem::YDataType>(y_block_dstr);
+        return y_block_tensor;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetUK_0()
+    {
+        using S_ = typename Problem::BlockShape;
+        if constexpr(std::is_same_v<typename Problem::ADataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::GDataType, ck_tile::bf16_t> &&
+                     S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 &&
+                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+        {
+            return Flatmm_32x512x128_1x4x1_16x16x32_BF16{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::ADataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::GDataType, ck_tile::fp16_t> &&
+                          S_::Block_M0 == 32 && S_::Block_N0 == 512 && S_::Block_K0 == 128 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+        {
+            return Flatmm_32x512x128_1x4x1_16x16x32_FP16{};
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetUK_1()
+    {
+        using S_ = typename Problem::BlockShape;
+        if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                     std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                     S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+        {
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+        {
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+        }
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
new file mode 100644
index 000000000..a6f71eafa
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
+
+namespace ck_tile {
+
+/*
+This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight)
+we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave)
+
+    <----- gemm-N ------>
+    +----+----+----+----+
+    | w0 | w1 | w2 | w3 | gemm-m
+    +----+----+----+----+
+*/
+template <typename Problem_, typename Policy_ = FusedMoeGemmPipelineFlatmmPolicy>
+struct FusedMoeGemmPipeline_FlatmmUk
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape
+
+    using ADataType            = typename Problem::ADataType;
+    using GDataType            = typename Problem::GDataType;
+    using DDataType            = typename Problem::DDataType;
+    using AccDataType          = typename Problem::AccDataType;
+    using ODataType            = typename Problem::ODataType;
+    using AScaleDataType       = typename Problem::AScaleDataType;
+    using GScaleDataType       = typename Problem::GScaleDataType;
+    using DScaleDataType       = typename Problem::DScaleDataType;
+    using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType;
+    using TopkWeightDataType   = typename Problem::TopkWeightDataType;
+    using IndexDataType        = typename Problem::IndexDataType;
+    using YDataType            = typename Problem::YDataType;
+
+    using Traits = typename Problem::Traits;
+
+    static constexpr bool IsGateOnly          = Traits::IsGateOnly;
+    static constexpr bool UseSmoothQuant      = Traits::UseSmoothQuant;
+    static constexpr bool PadHiddenSize       = Traits::PadHiddenSize;
+    static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize;
+
+    static constexpr index_t kAlignmentA = Policy::template GetAlignment_A<Problem>();
+    static constexpr index_t kAlignmentG = Policy::template GetAlignment_G<Problem>();
+    static constexpr index_t kAlignmentD = Policy::template GetAlignment_D<Problem>();
+    static constexpr index_t kAlignmentO = Policy::template GetAlignment_O<Problem>();
+
+    static constexpr index_t SLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::SLD_A);
+    static constexpr index_t GLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_A);
+    static constexpr index_t GLD_B = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_B);
+    static constexpr index_t GST_O = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GST_O);
+
+    static constexpr index_t kBlockPerCu = []() {
+        if constexpr(Problem::kBlockPerCu != -1)
+            return Problem::kBlockPerCu;
+        else
+        {
+            // minimize occupancy
+            return 2;
+        }
+    }();
+
+    static constexpr const char* name = "flatmm_uk";
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        constexpr index_t smem_0 = Policy::template GetUK_0<Problem>().GetSmemSize();
+        constexpr index_t smem_1 = Policy::template GetUK_1<Problem>().GetSmemSize();
+        constexpr index_t smem_bridge =
+            BlockShape::Block_M0 * BlockShape::Block_N0 * sizeof(YDataType);
+        return max(smem_0, max(smem_1, smem_bridge));
+    }
+
+    // this is the thread-offset along row/col
+    CK_TILE_HOST_DEVICE static auto GetACoord()
+    {
+        constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A<Problem>();
+        const auto a_coord    = a_dist.calculate_index();
+        return a_coord;
+    }
+
+    // this is the thread-offset along row/col
+    CK_TILE_HOST_DEVICE static auto GetOCoord()
+    {
+        constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution<Problem>();
+        const auto o_coord    = o_dist.calculate_index();
+        return o_coord;
+    }
+
+    CK_TILE_DEVICE constexpr auto GetNumRowCoords_A()
+    {
+        constexpr index_t KLans   = BlockShape::Block_K0 / kAlignmentA;
+        constexpr index_t MLans   = BlockShape::BlockSize / KLans;
+        constexpr index_t MRepeat = BlockShape::Block_M0 / MLans;
+
+        return MRepeat;
+    }
+
+    // TODO: properlly support scatter/gather
+    CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset)
+    {
+        constexpr index_t KLans   = BlockShape::Block_K0 / kAlignmentA;
+        constexpr index_t MLans   = BlockShape::BlockSize / KLans;
+        constexpr index_t MRepeat = BlockShape::Block_M0 / MLans;
+
+        auto base_coord = threadIdx.x / KLans + base_offset;
+
+        array<index_t, MRepeat> coords;
+        static_for<0, MRepeat, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLans; });
+
+        return coords;
+    }
+
+    template <typename ROW_COORDS>
+    CK_TILE_DEVICE auto GetRowID(const ROW_COORDS coords, const IndexDataType* sorted_token_ids_ptr)
+    {
+        constexpr index_t n_size = coords.size();
+
+        array<index_t, n_size> row_ids;
+        static_for<0, n_size, 1>{}([&](auto i) {
+            row_ids.at(i) = sorted_token_ids_ptr[coords[i]]; // base_coord + i * MLans;
+        });
+
+        return row_ids;
+    }
+
+    template <typename ROW_COORDS>
+    CK_TILE_DEVICE auto GetWeightScale(const ROW_COORDS coords,
+                                       const TopkWeightDataType* sorted_weight_ptr)
+    {
+        constexpr index_t n_size = coords.size();
+
+        array<TopkWeightDataType, n_size> w;
+        static_for<0, n_size, 1>{}([&](auto i) {
+            w.at(i) = sorted_weight_ptr[coords[i]]; // base_coord + i * MLans;
+        });
+
+        return w;
+    }
+
+    // TODO: this row id is before shuffle atomic, need use acc distribution
+    CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset)
+    {
+        constexpr index_t MLanes   = BlockShape::Warp_M1;
+        constexpr index_t Repeat_M = BlockShape::Repeat_M1;
+
+        auto base_coord = threadIdx.x % MLanes + base_offset;
+
+        array<index_t, Repeat_M> coords;
+        static_for<0, Repeat_M, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLanes; });
+
+        return coords;
+    }
+
+    template <typename Karg>
+    CK_TILE_DEVICE auto operator()(const Karg& kargs,
+                                   CK_TILE_LDS_ADDR void* smem,
+                                   index_t sorted_tile_id,
+                                   index_t intermediate_tile_id)
+    {
+        constexpr index_t hidden_radio_0            = IsGateOnly ? 1 : 2;
+        ck_tile::index_t shared_intermediate_size_0 = kargs.intermediate_size;
+        ck_tile::index_t shared_intermediate_size_1 = kargs.intermediate_size / hidden_radio_0;
+
+        index_t nr_0 = shared_intermediate_size_0 / BlockShape::Warp_N0; // divide N in W
+        index_t kr_0 = kargs.hidden_size / BlockShape::Warp_K0;          // divide K in W
+        index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1;
+        index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1;
+
+        const IndexDataType expert_id = __builtin_amdgcn_readfirstlane(
+            reinterpret_cast<const IndexDataType*>(kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
+        index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
+        index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;
+
+        // nr*kr*w
+        index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
+            intermediate_tile_id *
+            BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W)
+
+        index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane(
+            intermediate_tile_id *
+            BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)
+
+        auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0);
+        auto row_ids_a    = GetRowID(
+            row_coords_a, reinterpret_cast<const IndexDataType*>(kargs.sorted_token_ids_ptr));
+        auto a_coords = generate_tuple(
+            [&](auto i) {
+                return row_ids_a[i] * kargs.stride_token +
+                       threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
+            },
+            number<row_ids_a.size()>{});
+        auto a_res =
+            make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
+                                      kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
+
+        auto g_win = [&]() {
+            const GDataType* g_ptr = reinterpret_cast<const GDataType*>(kargs.g_ptr) +
+                                     static_cast<long_index_t>(expert_id) * expert_stride_0 +
+                                     interm_idx_nr0 * kr_0 * BlockShape::Block_W0;
+            auto g_view_ = make_naive_tensor_view<address_space_enum::global>(
+                g_ptr,
+                make_tuple(nr_0, kr_0, number<BlockShape::Block_W0>{}),
+                make_tuple(kr_0 * BlockShape::Block_W0, number<BlockShape::Block_W0>{}, 1),
+                number<kAlignmentG>{},
+                number<1>{});
+
+            auto g_window_ = make_tile_window_linear_raw(
+                g_view_,
+                make_tuple(number<BlockShape::Block_Nr0>{},
+                           number<BlockShape::Block_Kr0>{},
+                           number<BlockShape::Block_W0>{}),
+                {0, 0, 0},
+                Policy::template MakeGlobalTileDistribution_G<Problem>(),
+                sequence<0, 1, 1>{});
+            return g_window_;
+        }();
+
+        auto g_res    = g_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_;
+        auto g_coords = generate_tuple([&](auto i) { return g_win.cached_coords_[i].get_offset(); },
+                                       number<decltype(g_win)::NumAccess_NonLinear>{});
+
+        const auto d_win = [&]() {
+            const DDataType* d_ptr = reinterpret_cast<const DDataType*>(kargs.d_ptr) +
+                                     static_cast<long_index_t>(expert_id) * expert_stride_1 +
+                                     interm_idx_kr1 * BlockShape::Block_W1;
+            // note interm_idx_nr0 is along the gemm-k dim of 2nd gemm
+
+            const auto d_view_ = make_naive_tensor_view<address_space_enum::global>(
+                d_ptr,
+                make_tuple(nr_1, kr_1, BlockShape::Block_W1),
+                make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1),
+                number<kAlignmentD>{},
+                number<1>{});
+
+            const auto d_window_ = make_tile_window_linear_raw(
+                d_view_,
+                make_tuple(number<BlockShape::Block_Nr1>{},
+                           number<BlockShape::Block_Kr1>{},
+                           number<BlockShape::Block_W1>{}),
+                {0, 0, 0},
+                Policy::template MakeGlobalTileDistribution_D<Problem>(),
+                sequence<0, 1, 1>{});
+            return d_window_;
+        }();
+        auto d_res = d_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_;
+
+        // TODO: load D order is N0.K0...127, N64.K0...127, N0.K128...255, N64.K128...255
+        //      block-k=512, block-n=128
+        //                    wg                     |<----- W_   ----->|
+        //       Nr(2)*Nw(4)* Kr *Kr0(4)*Kr1(4) * [Kl(4)*Nl(16)*Kv(8)]->one issue
+        //          y   p          y     y         p     p       y
+        //          1              2     0(imm)
+        auto d_coords = [&]() {
+            constexpr index_t Nr_          = 2;
+            constexpr index_t Nw_          = 4;
+            constexpr index_t Kr0_         = 4;
+            constexpr index_t Kr1_         = 4;
+            constexpr index_t Kl_          = 4;
+            constexpr index_t Nl_          = 16;
+            constexpr index_t Kv_          = 8;
+            constexpr index_t W_           = Kl_ * Nl_ * Kv_;
+            constexpr index_t num_offsets_ = Nr_ * Kr0_;
+            index_t base_os_               = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) *
+                                                              shared_intermediate_size_1 *
+                                                              Nl_; // Kr0_ * Kr1_ * W_;
+            return generate_tuple(
+                [&](auto i) {
+                    constexpr auto i_nr_  = number<i % Nr_>{};
+                    constexpr auto i_kr0_ = number<i / Nr_>{};
+
+                    return i_nr_ * shared_intermediate_size_1 * Nw_ * Nl_ + i_kr0_ * Kr1_ * W_ +
+                           base_os_;
+                },
+                number<num_offsets_>{});
+        }();
+
+        auto o_coords = generate_tuple(
+            [&](auto i) {
+                return row_ids_a[i] * kargs.stride_token +
+                       threadIdx.x % (BlockShape::Block_N1 / kAlignmentO) * kAlignmentO;
+            },
+            number<row_ids_a.size()>{});
+
+        auto o_flags =
+            generate_tuple([&](auto i) { return cmp_lt_to_exec(row_ids_a[i], kargs.num_tokens); },
+                           number<row_ids_a.size()>{});
+
+        auto bridge_sst_win = [&]() {
+            constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
+            constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
+            return make_tile_window_linear(make_tensor_view<address_space_enum::lds>(
+                                               reinterpret_cast<YDataType*>(smem), desc_),
+                                           desc_.get_lengths(),
+                                           {0, 0},
+                                           dist_);
+        }();
+        auto o_res =
+            make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
+                                      kargs.num_tokens * kargs.stride_token * sizeof(ODataType));
+
+        auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
+        auto w_scale      = GetWeightScale(
+            row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
+
+        auto uk_0  = Policy::template GetUK_0<Problem>();
+        auto acc_0 = uk_0(a_res,
+                          a_coords,
+                          g_res,
+                          g_coords,
+                          smem,
+                          kargs.hidden_size,
+                          BlockShape::Block_K0, // tile offset for B matrix each unroll
+                          BlockShape::Block_Kr0 *
+                              BlockShape::Block_W0); // tile offset for B matrix each unroll
+
+        sweep_tile(
+            acc_0,
+            [&](auto idx0, auto idx1) {
+                fp32x2_t v_{acc_0(idx0), acc_0(idx1)};
+                typename Problem::GateActivation{}(v_, v_);
+                acc_0(idx0) = v_.x;
+                acc_0(idx1) = v_.y;
+            },
+            sequence<1, 2>{});
+
+        auto y_pre = cast_tile<YDataType>(acc_0);
+
+        block_sync_lds();
+
+        store_tile(bridge_sst_win, y_pre);
+        block_sync_lds();
+
+        auto uk_1 = Policy::template GetUK_1<Problem>();
+        uk_1(d_res,
+             d_coords,
+             o_res,
+             o_coords,
+             o_flags,
+             smem,
+             kargs.hidden_size, // total n number
+             w_scale,
+             BlockShape::Block_Nr1 * kr_1 * BlockShape::Block_W1, // along N
+             BlockShape::Block_N1);                               // along N
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp
new file mode 100644
index 000000000..6089c2558
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+// TODO: alow 2 gemm have different type
+template <typename ADataType_,
+          typename GDataType_,
+          typename DDataType_,
+          typename AccDataType_,
+          typename ODataType_,
+          typename AScaleDataType_,
+          typename GScaleDataType_,
+          typename DScaleDataType_,
+          typename YSmoothScaleDataType_,
+          typename TopkWeightDataType_,
+          typename IndexDataType_,  // data type for all indexing
+          typename GateActivation_, // = ck_tile::element_wise::Silu,
+          typename BlockShape_,     // shoule be FusedMoeGemmShape
+          typename Traits_>
+struct FusedMoeGemmPipelineProblem
+{
+    using ADataType            = remove_cvref_t<ADataType_>;
+    using GDataType            = remove_cvref_t<GDataType_>;
+    using DDataType            = remove_cvref_t<DDataType_>;
+    using AccDataType          = remove_cvref_t<AccDataType_>;
+    using ODataType            = remove_cvref_t<ODataType_>;
+    using AScaleDataType       = remove_cvref_t<AScaleDataType_>;
+    using GScaleDataType       = remove_cvref_t<GScaleDataType_>;
+    using DScaleDataType       = remove_cvref_t<DScaleDataType_>;
+    using YSmoothScaleDataType = remove_cvref_t<YSmoothScaleDataType_>;
+    using TopkWeightDataType   = remove_cvref_t<TopkWeightDataType_>;
+    using IndexDataType        = remove_cvref_t<IndexDataType_>;
+
+    // the input for next gemm should have same time as
+    using YDataType = ADataType;
+
+    using GateActivation = remove_cvref_t<GateActivation_>;
+    using BlockShape     = remove_cvref_t<BlockShape_>;
+    using Traits         = remove_cvref_t<Traits_>;
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
new file mode 100644
index 000000000..d7127b098
--- /dev/null
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+enum class FusedMoeGemmWeightPermuteEnum
+{
+    // permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6
+    // permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6
+    no_permute          = 0,
+    b_nr_kr_kw_nw_kv    = 1, // 0,1,3,4,2,5
+    b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv,
+};
+
+template <bool IsGateOnly_,
+          bool UseSmoothQuant_,
+          index_t OAtomic_, // 0-no atomic, 1-atomic-pk-f16/bf16, 2-atomic-f32
+          FusedMoeGemmWeightPermuteEnum PermuteEnum_ =
+              FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten,
+          bool PadHiddenSize_       = false,
+          bool PadIntermediateSize_ = false>
+struct FusedMoeGemmTraits
+{
+    // Gate+Up or Gate only
+    static constexpr bool IsGateOnly                           = IsGateOnly_;
+    static constexpr bool UseSmoothQuant                       = UseSmoothQuant_;
+    static constexpr index_t OAtomic                           = OAtomic_;
+    static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_;
+    static constexpr bool PadHiddenSize                        = PadHiddenSize_;
+    static constexpr bool PadIntermediateSize                  = PadIntermediateSize_;
+};
+
+// Note: this need to be a bit mask
+enum class FusedMoeGemmPipelineSequencerEnum
+{
+    SLD_A = 1 << 0, // shared load a
+    SLD_B = 1 << 1,
+    GLD_A = 1 << 2, // global load a
+    GLD_B = 1 << 3,
+    SST_A = 1 << 4, // shared store a
+    SST_B = 1 << 5,
+    GST_O = 1 << 6, // global store out
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index 7ca4a697a..89ea82c5b 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -10,114 +10,134 @@
 namespace ck_tile {
 
 // fp16
-using WarpGemmMfmaF16F16F32M32N32K8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
 
-using WarpGemmMfmaF16F16F32M16N16K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
+using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfmaF16F16F32M32N32K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
+using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfmaF16F16F32M16N16K32 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M16N16K16, 2>>;
+using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;
 
-using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 1>>;
+using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
+    2>>;
 
-using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
+using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    1>>;
 
-using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;
 
-using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;
+using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
 using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplF16F16F32M16N16K16,
+        WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2>>;
 
 using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
 // bf16
-using WarpGemmMfmaBf16Bf16F32M32N32K8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
 
-using WarpGemmMfmaBf16Bf16F32M16N16K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfmaBf16Bf16F32M32N32K16 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    2>>;
 
-using WarpGemmMfmaBf16Bf16F32M16N16K32 =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16, 2>>;
+using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
+    2>>;
 
-using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 1>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+    1>>;
 
-using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl<
-    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA =
+    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
+        2>>;
 
-using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;
+using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2>>;
 
 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
 // fp8
-using WarpGemmMfma_f32_32x32x16_fp8_fp8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;
 
-using WarpGemmMfma_f32_32x32x16_fp8_bf8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_bf8_fp8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_bf8_bf8 =
-    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl<
-    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;
+using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
 template <index_t swizzle_factor = 2>
 using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>,
+        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, WGAttrCtlEnum::Default_>,
         2,
         swizzle_factor>>;
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index d80e5198e..0a8d2dfbe 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -25,6 +25,8 @@ struct WarpGemmAtrributeMfma
     static constexpr index_t kN = Impl::kN;
     static constexpr index_t kK = Impl::kK;
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -51,10 +53,13 @@ struct WarpGemmAtrributeMfma
         sequence<0, 2>>;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
-        Impl{}(c_vec, a_vec, b_vec);
+        Impl{}(c_vec, a_vec, b_vec, bool_constant<post_nop_>{});
     }
 
     // c_vec = a_vec * b_vec
@@ -85,6 +90,8 @@ struct WarpGemmAtrributeMfmaIterateK
     static constexpr index_t kN = Impl::kN;
     static constexpr index_t kK = Impl::kK * kKIter;
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
@@ -111,8 +118,11 @@ struct WarpGemmAtrributeMfmaIterateK
         sequence<0, 2>>;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
         using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -122,10 +132,33 @@ struct WarpGemmAtrributeMfmaIterateK
                    reinterpret_cast<const buf_a&>(a_vec)
                        .template get_as<typename Impl::AVecType>()[iKIter],
                    reinterpret_cast<const buf_b&>(b_vec)
-                       .template get_as<typename Impl::BVecType>()[iKIter]);
+                       .template get_as<typename Impl::BVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
         });
     }
 
+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
@@ -168,6 +201,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
     static constexpr index_t kN = Impl::kM;
     static constexpr index_t kK = Impl::kK;
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -194,11 +229,14 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
         sequence<0, 2>>;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         // swap A and B
-        Impl{}(c_vec, b_vec, a_vec);
+        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
     }
 
     // c_vec = a_vec * b_vec
@@ -226,6 +264,8 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
     static constexpr index_t kN = Impl::kM;
     static constexpr index_t kK = Impl::kK;
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -255,12 +295,15 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
         sequence<2, 2>,
         sequence<0, 2>>;
 
+    template <bool post_nop_ = false>
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         // swap A and B
-        Impl{}(c_vec, b_vec, a_vec);
+        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
     }
 
     // c_vec = a_vec * b_vec
@@ -291,6 +334,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
     static constexpr index_t kN = Impl::kM;
     static constexpr index_t kK = Impl::kK * kKIter;
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
@@ -316,9 +361,12 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
         sequence<2, 2>,
         sequence<0, 2>>;
 
+    template <bool post_nop_ = false>
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
         using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -328,10 +376,34 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
                    reinterpret_cast<const buf_b&>(b_vec)
                        .template get_as<typename Impl::BVecType>()[iKIter],
                    reinterpret_cast<const buf_a&>(a_vec)
-                       .template get_as<typename Impl::AVecType>()[iKIter]);
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
         });
     }
 
+    template <index_t iKIter, bool post_nop_ = false>
+    // c_vec += a_vec * b_vec
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+        // swap A and B, value and type
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
@@ -377,6 +449,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
     static constexpr index_t kK      = Impl::kK * kKIter;
     static constexpr index_t SFactor = SFactor_; // group how many CM1 together
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
@@ -429,8 +503,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
         sequence<0, 2>>;
 #endif
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
         using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -440,10 +517,33 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
                    reinterpret_cast<const buf_b&>(b_vec)
                        .template get_as<typename Impl::BVecType>()[iKIter],
                    reinterpret_cast<const buf_a&>(a_vec)
-                       .template get_as<typename Impl::AVecType>()[iKIter]);
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
         });
     }
 
+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+        // swap A and B, value and type
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
@@ -488,6 +588,8 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
     static constexpr index_t kK      = Impl::kK * kKIter;
     static constexpr index_t SFactor = SFactor_; // group how many CM1 together
 
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
@@ -518,8 +620,11 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
         sequence<0, 2>>;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
         using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
         using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
@@ -529,10 +634,33 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
                    reinterpret_cast<const buf_a&>(a_vec)
                        .template get_as<typename Impl::AVecType>()[iKIter],
                    reinterpret_cast<const buf_b&>(b_vec)
-                       .template get_as<typename Impl::BVecType>()[iKIter]);
+                       .template get_as<typename Impl::BVecType>()[iKIter],
+                   bool_constant<post_nop_>{});
         });
     }
 
+    template <index_t iKIter, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   number<iKIter>,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+
+        static_assert(iKIter < kKIter);
+
+        // static_for<0, kKIter, 1>{}([&](auto iKIter) {
+        Impl{}(c_vec,
+               reinterpret_cast<const buf_a&>(a_vec)
+                   .template get_as<typename Impl::AVecType>()[iKIter],
+               reinterpret_cast<const buf_b&>(b_vec)
+                   .template get_as<typename Impl::BVecType>()[iKIter],
+               bool_constant<post_nop_>{});
+        //});
+    }
+
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index bb59a7298..0aba1f535 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,12 +7,68 @@
 
 namespace ck_tile {
 
+// TODO: refactor warp-gemm
+// currently there is a discrepency for vav/vva if we need transpose C/D
+// e.g. if we want A:agpr, B:vgpr, we have to use vva in WGAttrEnum
+// because we swap the A/B pointer in _impl code (but not known this info here)
+enum class WGAttrCtlEnum
+{
+    Default_ = 0,
+    Raw_vvv  = 1, // c-vgpr, a-vgpr, b-vgpr
+    Raw_vaa  = 2, // c-vgpr, a-agpr, b-agpr
+    Raw_vav  = 3, // c-vgpr, a-agpr, b-vgpr
+    Raw_vva  = 4, // c-vgpr, a-vgpr, b-agpr
+    Raw_avv  = 5, // c-agpr, a-vgpr, b-vgpr
+    // raw_a_a_a = 3,  // c-agpr, a-agpr, b-agpr
+};
+
+#define DISPATCH_MFMA_(mfma_, dmod_, amod_, bmod_, cmod_)       \
+    if constexpr(post_nop_)                                     \
+    {                                                           \
+        asm volatile(mfma_ " %0, %1, %2, %3 ; yyy\n"            \
+                           "s_nop 3"                            \
+                     : dmod_(c_vec)                             \
+                     : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \
+                     :);                                        \
+    }                                                           \
+    else                                                        \
+    {                                                           \
+        asm volatile(mfma_ " %0, %1, %2, %3\n"                  \
+                     : dmod_(c_vec)                             \
+                     : amod_(a_vec), bmod_(b_vec), cmod_(c_vec) \
+                     :);                                        \
+    }
+
+#define DISPATCH_MFMA_CTRL_(mfma_, ctrl_)              \
+    if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vvv)      \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "v", "v", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vaa) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "a", "a", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vav) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "a", "v", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_vva) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+v", "v", "a", "v")     \
+    }                                                  \
+    else if constexpr(ctrl_ == WGAttrCtlEnum::Raw_avv) \
+    {                                                  \
+        DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a")     \
+    }
+
 // FP16
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
 {
-    using ADataType = fp16_t;
-    using BDataType = fp16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
 
     using AVecType = ext_vector_t<fp16_t, 4>;
     using BVecType = ext_vector_t<fp16_t, 4>;
@@ -33,16 +89,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
     static constexpr index_t kCM1PerLane = 4;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8f16", Ctrl)
+        else
+        {
 #if defined(__gfx9__)
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
     }
 
     // c_vec = a_vec * b_vec
@@ -52,18 +115,20 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
 {
-    using ADataType = fp16_t;
-    using BDataType = fp16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
 
     using AVecType = ext_vector_t<fp16_t, 4>;
     using BVecType = ext_vector_t<fp16_t, 4>;
@@ -84,16 +149,23 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
     static constexpr index_t kCM1PerLane = 4;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16f16", Ctrl)
+        else
+        {
 #if defined(__gfx9__)
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
     }
 
     // c_vec = a_vec * b_vec
@@ -103,19 +175,21 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
         return bit_cast<CVecType>(
             __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
 };
 
 // Bf16
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
 {
-    using ADataType = bf16_t;
-    using BDataType = bf16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
 
     using AVecType = ext_vector_t<bf16_t, 4>;
     using BVecType = ext_vector_t<bf16_t, 4>;
@@ -136,28 +210,35 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
     static constexpr index_t kCM1PerLane = 4;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x8bf16_1k", Ctrl)
+        else
+        {
 #if defined(__gfx90a__) || defined(__gfx94__)
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
 #elif defined(__gfx908__)
-        static_for<0, 2, 1>{}([&](auto k) {
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
-                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                c_vec,
-                0,
-                0,
-                0);
-        });
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
     }
 
     // c_vec = a_vec * b_vec
@@ -181,18 +262,20 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
         });
         return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
 {
-    using ADataType = bf16_t;
-    using BDataType = bf16_t;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
 
     using AVecType = ext_vector_t<bf16_t, 4>;
     using BVecType = ext_vector_t<bf16_t, 4>;
@@ -213,28 +296,34 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
     static constexpr index_t kCM1PerLane = 4;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x16bf16_1k", Ctrl)
+        {
 #if defined(__gfx90a__) || defined(__gfx94__)
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
 #elif defined(__gfx908__)
-        static_for<0, 2, 1>{}([&](auto k) {
-            c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
-                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
-                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
-                c_vec,
-                0,
-                0,
-                0);
-        });
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
     }
 
     // c_vec = a_vec * b_vec
@@ -258,20 +347,21 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
         });
         return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
 };
 
 // FP8
-template <typename AType_, typename BType_>
+template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
 {
-    using ADataType = AType_;
-    using BDataType = BType_;
-    using CDataType = float;
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = AType_;
+    using BDataType                     = BType_;
+    using CDataType                     = float;
 
     using AVecType = ext_vector_t<ADataType, 8>;
     using BVecType = ext_vector_t<BDataType, 8>;
@@ -292,38 +382,120 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
     static constexpr index_t kCM1PerLane = 4;
 
     // c_vec += a_vec * b_vec
-    CK_TILE_DEVICE void
-    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
     {
+        if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "a", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "a", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_fp8_bf8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_32x32x16_bf8_bf8", "+v", "v", "a", "v")
+            }
+        }
+        else
+        {
 #if defined(__gfx94__)
-        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
 #elif defined(__gfx908__) || defined(__gfx90a__)
-        static_for<0, 8, 1>{}([&](auto k) {
-            float a_f32 =
-                type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
-                                        .template get_as<ADataType>()[number<k>{}]);
-            float b_f32 =
-                type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
-                                        .template get_as<BDataType>()[number<k>{}]);
-
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
-        });
+            static_for<0, 8, 1>{}([&](auto k) {
+                float a_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                                            .template get_as<ADataType>()[number<k>{}]);
+                float b_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                                            .template get_as<BDataType>()[number<k>{}]);
+
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
+            });
 #else
-        ignore = c_vec;
-        ignore = a_vec;
-        ignore = b_vec;
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
 #endif
+        }
     }
 
     // c_vec = a_vec * b_vec
@@ -356,20 +528,97 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
         });
         return c_vec;
 #else
-        ignore = a_vec;
-        ignore = b_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
         return CVecType{0.f};
 #endif
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t, Ctrl_>;
+
+// int8
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = int8_t;
+    using BDataType                     = int8_t;
+    using CDataType                     = int32_t;
+
+    using AVecType = ext_vector_t<ADataType, 8>;
+    using BVecType = ext_vector_t<BDataType, 8>;
+    using CVecType = ext_vector_t<CDataType, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 16;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x16_i8", Ctrl)
+        else
+        {
+#if defined(__gfx94__)
+            c_vec = __builtin_amdgcn_mfma_i32_32x32x8i8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#elif defined(__gfx908__) || defined(__gfx90a__)
+            static_for<0, 8, 1>{}([&](auto k) {
+                float a_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                                            .template get_as<ADataType>()[number<k>{}]);
+                float b_f32 =
+                    type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                                            .template get_as<BDataType>()[number<k>{}]);
+
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
+            });
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        CVecType c_vec{0};
+        operator()(c_vec, a_vec, b_vec);
+        return c_vec;
+    }
+};
+
+#undef DISPATCH_MFMA_
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 4183d9cb9..99cd5d787 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,40 +21,40 @@ struct WarpGemmMfmaDispatcher;
 
 // clang-format off
 // fp16
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
 
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<half_t, half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
 
 // bf16
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
 
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<bf16_t, bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
 
 // fp8
-template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<fp8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<bf8_t, bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
 
 // clang-format on
 } // namespace impl
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
index eb9dbf127..182d023a0 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
@@ -31,11 +31,21 @@ struct WarpGemmImpl
     using BWarpTensor = static_distributed_tensor<BDataType, BWarpDstr>;
     using CWarpTensor = static_distributed_tensor<CDataType, CWarpDstr>;
 
-    CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access()
     {
-        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
-        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
-        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
+        return WarpGemmAttribute_::get_num_of_access();
+    }
+
+    template <typename CTensor, typename ATensor, typename BTensor, bool post_nop_ = false>
+    CK_TILE_DEVICE void
+    operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant<post_nop_> = {}) const
+    {
+        static_assert(detail::is_similiar_distributed_tensor_v<CTensor, CWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<ATensor, AWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<BTensor, BWarpTensor>);
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;
 
         constexpr auto I0 = number<0>{};
 
@@ -44,18 +54,49 @@ struct WarpGemmImpl
         auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];
 
         // c_vec += a_vec * b_vec
-        WarpGemmAttribute{}(c_vec, a_vec, b_vec);
+        WarpGemmAttribute{}(c_vec, a_vec, b_vec, bool_constant<post_nop_>{});
 
         c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
     }
 
-    CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const BWarpTensor& b) const
+    template <typename CTensor,
+              typename ATensor,
+              typename BTensor,
+              index_t i_subk,
+              bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CTensor& c,
+                                   const ATensor& a,
+                                   const BTensor& b,
+                                   number<i_subk>,
+                                   bool_constant<post_nop_> = {}) const
     {
-        CWarpTensor c;
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;
+
+        constexpr auto I0 = number<0>{};
 
-        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
-        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
-        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;
+        const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
+        const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
+        auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];
+
+        // c_vec += a_vec * b_vec
+        WarpGemmAttribute{}(c_vec, a_vec, b_vec, number<i_subk>{}, bool_constant<post_nop_>{});
+
+        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
+    }
+
+    template <typename ATensor, typename BTensor>
+    CK_TILE_DEVICE auto operator()(const ATensor& a, const BTensor& b) const
+    {
+        using CTensor = CWarpTensor;
+        static_assert(detail::is_similiar_distributed_tensor_v<ATensor, AWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<BTensor, BWarpTensor>);
+        CTensor c;
+
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;
 
         constexpr auto I0 = number<0>{};
 
diff --git a/include/ck_tile/ops/moe_sorting.hpp b/include/ck_tile/ops/moe_sorting.hpp
deleted file mode 100644
index b74607f06..000000000
--- a/include/ck_tile/ops/moe_sorting.hpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp"
-#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
-#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
-#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp"
-#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
-#include "ck_tile/ops/common/tensor_layout.hpp"
-- 
GitLab


From b6bcd76d881421af2f04246b1e4bbac45b7ce3b9 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Tue, 26 Nov 2024 08:45:14 +0100
Subject: [PATCH 079/153] CK-Tile first draft of universal block gemm with
 interwave & intrawave scheduler (#1676)

* Block universal gemm.

* Universal block gemm with interwave scheduler - draft.

* Refactoring

* Move a/b_warp_tiles into BlockGemmImpl
* set BlockGemmImpl as a class member

* Change tile size for more suitable to memory bound cases.

* Introduce kKPerThread to WarpGemm

* Add documentation comment.

* Fix Interwave scheduler block gemm.

* Add compute/memory friendly tile configuration.

* Clean

* New tile configurations in gemm mem example.

* Add more static checks and fix loop order in block gemm.

* Add more static checks and use warp gemm mfma dispatcher.

* Add default scheduler block gemm.

* Remove logging in example.
---
 example/01_gemm/run_gemm_example_v2.inc       |   2 +-
 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp |  33 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |  22 +-
 include/ck_tile/ops/gemm.hpp                  |   1 +
 .../block/block_universal_gemm_as_bs_cr.hpp   | 661 ++++++++++++++++++
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   |  12 +-
 .../gemm_pipeline_ag_bg_cr_scheduler.hpp      |   2 +
 ...ine_agmem_bgmem_creg_v1_default_policy.hpp |  40 +-
 .../gemm/pipeline/gemm_pipeline_problem.hpp   |   2 +
 .../gemm/warp/warp_gemm_attribute_mfma.hpp    |  55 +-
 .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp  |   7 +-
 11 files changed, 780 insertions(+), 57 deletions(-)
 create mode 100644 include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp

diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index 71524fdec..5b6969f1d 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -261,7 +261,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     if(config.time_kernel)
     {
         ave_time =
-            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 50, 100, true, 4});
 
         std::size_t flop = 2_uz * M * N * K;
         std::size_t num_btype =
diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
index ff9d8bad3..97d150412 100644
--- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
+++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
@@ -17,9 +17,24 @@
 template <typename ALayout, typename BLayout, typename CLayout>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
-    // ToDo: This will be modified by the codegen code later.
+#if 1
+    // Memory friendly for Interwave scheduler
     constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 32;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 4;
+    constexpr ck_tile::index_t N_Warp = 1;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+#else
+    // Compute friendly for Intrawave scheduler
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
     constexpr ck_tile::index_t K_Tile = 32;
 
     constexpr ck_tile::index_t M_Warp = 2;
@@ -28,12 +43,12 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 
     constexpr ck_tile::index_t M_Warp_Tile = 32;
     constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+#endif
 
-    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = true;
-    constexpr bool kPadN = true;
-    constexpr bool kPadK = true;
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
 
     constexpr int kBlockPerCu = 1;
 
@@ -174,8 +189,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
         {
             std::ostringstream err;
             err << "When there's no hot loop, this tail number \"" << tail_num
-                << "\" is not supported! " << __FILE__ << ":" << __LINE__
-                << ", in function: " << __func__;
+                << "\" is not supported! PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
             throw std::runtime_error(err.str());
         }
     }
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 8db131738..5199c1e3e 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -31,15 +31,13 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     float ave_time = gemm_calc<ALayout, BLayout, CLayout>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
-    std::string op_name{"Gemm{MemBoundPipeline}"};
-
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_byte =
         sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_byte / 1.E6 / ave_time;
 
-    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
+    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
               << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
               << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << std::endl;
@@ -114,7 +112,6 @@ int run_gemm_example_with_layouts(int argc,
         f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
 
     // TODO: add different init types
-
     ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
     ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
 
@@ -202,14 +199,15 @@ int run_gemm_example(int argc, char* argv[])
     {
         return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
     }
-    else if(a_layout == "C" && b_layout == "C")
-    {
-        return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
-    }
-    else if(a_layout == "C" && b_layout == "R")
-    {
-        return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
-    }
+    // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not
+    // work. else if(a_layout == "C" && b_layout == "C")
+    // {
+    //     return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
+    // }
+    // else if(a_layout == "C" && b_layout == "R")
+    // {
+    //     return run_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    // }
     else
     {
         throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index ac74782a3..9a033ee2d 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -22,6 +22,7 @@
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
+#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
new file mode 100644
index 000000000..5f98a7a0b
--- /dev/null
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+
+namespace ck_tile {
+
+// A is block window on shared memory
+// B is block window on shared memory
+// C is block distributed tensor
+template <typename Problem_, typename Policy_ = BlockGemmASmemBSmemCRegV1DefaultPolicy>
+struct BlockUniversalGemmAsBsCr
+{
+    private:
+    // TODO: This should be in Policy - UniversalGemmPolicyBase ?
+    template <typename PipelineProblem_, typename GemmPolicy_>
+    struct GemmTraits_
+    {
+        using Problem        = remove_cvref_t<PipelineProblem_>;
+        using Policy         = remove_cvref_t<GemmPolicy_>;
+        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+        static constexpr index_t kBlockSize = Problem::kBlockSize;
+        static constexpr auto Scheduler     = Problem::Scheduler;
+
+        static constexpr index_t MPerBlock = BlockGemmShape::kM;
+        static constexpr index_t NPerBlock = BlockGemmShape::kN;
+        static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+        static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WarpGemm = remove_cvref_t<decltype(config.template at<0>())>;
+
+        static constexpr index_t MWarp = config.template at<1>();
+        static constexpr index_t NWarp = config.template at<2>();
+
+        static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}),
+                      "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!");
+        static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}),
+                      "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!");
+        static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}),
+                      "Error! WarpGemm's M is not consisten with BlockGemmShape!");
+        static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}),
+                      "Error! WarpGemm's N is not consisten with BlockGemmShape!");
+
+        static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
+        static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN);
+        static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;
+
+        static_assert(MIterPerWarp * MWarp * WarpGemm::kM == MPerBlock,
+                      "Error! Warps should cover all Block tile!");
+        static_assert(NIterPerWarp * NWarp * WarpGemm::kN == NPerBlock,
+                      "Error! Warps should cover all Block tile!");
+
+        static constexpr index_t MPerBlockPerIter = MWarp * WarpGemm::kM;
+        static constexpr index_t NPerBlockPerIter = NWarp * WarpGemm::kN;
+        static constexpr index_t KPerBlockPerIter = WarpGemm::kK;
+
+        using AWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
+            typename WarpGemm::AWarpDstrEncoding{}))>;
+        using BWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
+            typename WarpGemm::BWarpDstrEncoding{}))>;
+
+        using AWarpTile =
+            remove_cvref_t<decltype(make_static_distributed_tensor<ADataType>(AWarpTileDistr{}))>;
+        using BWarpTile =
+            remove_cvref_t<decltype(make_static_distributed_tensor<BDataType>(BWarpTileDistr{}))>;
+
+        // TODO: Should we have two policies? Interwave & Intrawave ??
+        static constexpr index_t InterWaveSchedulingMacClusters = 1;
+
+        static constexpr index_t KPack      = WarpGemm::kKPerThread;
+        static constexpr index_t KPerThread = KPerBlock / WarpGemm::kK * KPack;
+        static constexpr index_t KRepeat    = KPerThread / KPack;
+    };
+
+    public:
+    using Traits = GemmTraits_<Problem_, Policy_>;
+
+    using ADataType = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType = remove_cvref_t<typename Traits::BDataType>;
+    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+
+    using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
+
+    static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
+    static constexpr index_t MIterPerWarp = Traits::MIterPerWarp;
+    static constexpr index_t NIterPerWarp = Traits::NIterPerWarp;
+
+    static constexpr index_t MWarp = Traits::MWarp;
+    static constexpr index_t NWarp = Traits::NWarp;
+
+    static constexpr auto Scheduler = Traits::Scheduler;
+
+    private:
+    template <GemmPipelineScheduler Scheduler, typename GemmTraits>
+    struct BlockGemmImpl
+    {
+    };
+
+    template <typename GemmTraits>
+    struct BlockGemmImpl<GemmPipelineScheduler::Default, GemmTraits>
+    {
+        // C += A * B
+        template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
+        CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                       const ASmemBlockWindow& a_block_window,
+                                       const BSmemBlockWindow& b_block_window)
+        {
+            static_assert(
+                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
+                "The CDataType as defined in traits should be the same as correspoinding "
+                "C block tensor data type!");
+            static_assert(std::is_same_v<typename GemmTraits::ADataType,
+                                         typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<typename GemmTraits::BDataType,
+                                             typename BSmemBlockWindow::DataType>,
+                          "The ADataType and BDataType as defined in "
+                          "traits should be the same as correspoinding block window data type!");
+
+            static_assert(
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                "MPerBlock, NPerBlock, KPerBlock defined in "
+                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
+
+            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+
+            // TODO: refactor warp_window tile type to class member as it should be
+            // compile-time known information.
+            auto a_warp_window_tmp = make_tile_window(
+                a_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
+                a_block_window.get_window_origin() +
+                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+
+            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
+
+            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
+                              AWarpWindow::get_num_of_dimension(),
+                          "AWarpWindow number of dimensions must be equal to "
+                          "AWarpTile number of dimensions!");
+            static_assert(GemmTraits::AWarpTile::get_lengths() ==
+                              AWarpWindow{}.get_window_lengths(),
+                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
+
+            statically_indexed_array<
+                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
+                GemmTraits::MIterPerWarp>
+                a_warp_windows;
+
+            // construct B-warp-window
+            auto b_warp_window_tmp = make_tile_window(
+                b_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
+                b_block_window.get_window_origin() +
+                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+
+            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
+
+            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
+                              BWarpWindow::get_num_of_dimension(),
+                          "BWarpWindow number of dimensions must be equal to "
+                          "BWarpTile number of dimensions!");
+            static_assert(GemmTraits::BWarpTile::get_lengths() ==
+                              BWarpWindow{}.get_window_lengths(),
+                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
+
+            statically_indexed_array<
+                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
+                GemmTraits::NIterPerWarp>
+                b_warp_windows;
+
+            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                    // TODO: I don't have to move 0,0 window!
+                    move_tile_window(a_warp_windows(mIter)(kIter),
+                                     {mIter * GemmTraits::MPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                    move_tile_window(b_warp_windows(nIter)(kIter),
+                                     {nIter * GemmTraits::NPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
+            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+
+            constexpr auto c_warp_y_lengths =
+                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+            // hot loop:
+            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                    const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
+
+                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                        const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
+
+                        // read C warp tensor from C block tensor-
+                        CWarpTensor c_warp_tensor;
+
+                        c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                        // warp GEMM
+                        typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);
+
+                        // write C warp tensor into C block tensor
+                        c_block_tensor.set_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                            c_warp_tensor.get_thread_buffer());
+                    });
+                });
+            });
+        }
+    };
+
+    template <typename GemmTraits>
+    struct BlockGemmImpl<GemmPipelineScheduler::Intrawave, GemmTraits>
+    {
+        statically_indexed_array<
+            statically_indexed_array<typename GemmTraits::AWarpTile, GemmTraits::KIterPerWarp>,
+            GemmTraits::MIterPerWarp>
+            a_warp_tiles_;
+
+        statically_indexed_array<
+            statically_indexed_array<typename GemmTraits::BWarpTile, GemmTraits::KIterPerWarp>,
+            GemmTraits::NIterPerWarp>
+            b_warp_tiles_;
+
+        template <typename ASmemBlockWindow, typename BSmemBlockWindow>
+        CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
+                                          const BSmemBlockWindow& b_block_window)
+        {
+            static_assert(
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                "MPerBlock, NPerBlock, KPerBlock defined in "
+                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
+
+            static_assert(std::is_same_v<typename GemmTraits::ADataType,
+                                         typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<typename GemmTraits::BDataType,
+                                             typename BSmemBlockWindow::DataType>,
+                          "The ADataType and BDataType as defined in "
+                          "traits should be the same as correspoinding block window data type!");
+
+            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+
+            // TODO: refactor warp_window tile type to class member as it should be
+            // compile-time known information.
+            auto a_warp_window_tmp = make_tile_window(
+                a_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
+                a_block_window.get_window_origin() +
+                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+
+            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
+
+            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
+                              AWarpWindow::get_num_of_dimension(),
+                          "AWarpWindow number of dimensions must be equal to "
+                          "AWarpTile number of dimensions!");
+            static_assert(GemmTraits::AWarpTile::get_lengths() ==
+                              AWarpWindow{}.get_window_lengths(),
+                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
+
+            statically_indexed_array<
+                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
+                GemmTraits::MIterPerWarp>
+                a_warp_windows;
+
+            // construct B-warp-window
+            auto b_warp_window_tmp = make_tile_window(
+                b_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
+                b_block_window.get_window_origin() +
+                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+
+            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
+
+            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
+                              BWarpWindow::get_num_of_dimension(),
+                          "BWarpWindow number of dimensions must be equal to "
+                          "BWarpTile number of dimensions!");
+            static_assert(GemmTraits::BWarpTile::get_lengths() ==
+                              BWarpWindow{}.get_window_lengths(),
+                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
+
+            statically_indexed_array<
+                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
+                GemmTraits::NIterPerWarp>
+                b_warp_windows;
+
+            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                    // TODO: I don't have to move 0,0 window!
+                    move_tile_window(a_warp_windows(mIter)(kIter),
+                                     {mIter * GemmTraits::MPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                    move_tile_window(b_warp_windows(nIter)(kIter),
+                                     {nIter * GemmTraits::NPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                    // read A warp tensor from A block window
+                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                });
+                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                });
+            });
+        }
+
+        // C += A * B
+        template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
+        CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                       [[maybe_unused]] const ASmemBlockWindow& a_block_window,
+                                       [[maybe_unused]] const BSmemBlockWindow& b_block_window)
+        {
+            static_assert(
+                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
+                "The CDataType as defined in traits should be the same as correspoinding "
+                "C block tensor data type!");
+
+            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
+            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+
+            constexpr auto c_warp_y_lengths =
+                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+            // hot loop:
+            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                        // read C warp tensor from C block tensor-
+                        CWarpTensor c_warp_tensor;
+
+                        c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                        // warp GEMM
+                        typename GemmTraits::WarpGemm{}(c_warp_tensor,
+                                                        a_warp_tiles_[mIter][kIter],
+                                                        b_warp_tiles_[nIter][kIter]);
+
+                        // write C warp tensor into C block tensor
+                        c_block_tensor.set_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                            c_warp_tensor.get_thread_buffer());
+                    });
+                });
+            });
+        }
+    };
+
+    template <typename GemmTraits>
+    struct BlockGemmImpl<GemmPipelineScheduler::Interwave, GemmTraits>
+    {
+        static constexpr index_t KPerThread     = GemmTraits::KPerThread;
+        static constexpr index_t NumMacClusters = GemmTraits::InterWaveSchedulingMacClusters;
+        static constexpr index_t KPerInnerLoop =
+            ck_tile::max(KPerThread / NumMacClusters, GemmTraits::KPack);
+        // TODO: do we really need this?? Are there any cases when this would be >=1 ??
+        // Would we need InterWaveSchedulingMacClusters > 1 ???
+        static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
+        static constexpr index_t KInnerLoopIter = KPerInnerLoop / GemmTraits::KPack;
+
+        statically_indexed_array<
+            statically_indexed_array<typename GemmTraits::AWarpTile, KInnerLoopIter>,
+            GemmTraits::MIterPerWarp>
+            a_warp_tiles_;
+
+        statically_indexed_array<
+            statically_indexed_array<typename GemmTraits::BWarpTile, KInnerLoopIter>,
+            GemmTraits::NIterPerWarp>
+            b_warp_tiles_;
+
+        template <index_t KIdx, typename ASmemBlockWindow, typename BSmemBlockWindow>
+        CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
+                                          const BSmemBlockWindow& b_block_window)
+        {
+            static_assert(
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                "MPerBlock, NPerBlock, KPerBlock defined in "
+                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
+
+            static_assert(std::is_same_v<typename GemmTraits::ADataType,
+                                         typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<typename GemmTraits::BDataType,
+                                             typename BSmemBlockWindow::DataType>,
+                          "The ADataType and BDataType as defined in "
+                          "traits should be the same as correspoinding block window data type!");
+
+            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+
+            // TODO: refactor warp_window tile type to class member as it should be
+            // compile-time known information.
+            auto a_warp_window_tmp = make_tile_window(
+                a_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
+                a_block_window.get_window_origin() +
+                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+
+            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
+
+            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
+                              AWarpWindow::get_num_of_dimension(),
+                          "AWarpWindow number of dimensions must be equal to "
+                          "AWarpTile number of dimensions!");
+            static_assert(GemmTraits::AWarpTile::get_lengths() ==
+                              AWarpWindow{}.get_window_lengths(),
+                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
+
+            statically_indexed_array<statically_indexed_array<AWarpWindow, KInnerLoopIter>,
+                                     GemmTraits::MIterPerWarp>
+                a_warp_windows;
+
+            // construct B-warp-window
+            auto b_warp_window_tmp = make_tile_window(
+                b_block_window.get_bottom_tensor_view(),
+                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
+                b_block_window.get_window_origin() +
+                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+
+            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
+
+            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
+                              BWarpWindow::get_num_of_dimension(),
+                          "BWarpWindow number of dimensions must be equal to "
+                          "BWarpTile number of dimensions!");
+            static_assert(GemmTraits::BWarpTile::get_lengths() ==
+                              BWarpWindow{}.get_window_lengths(),
+                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
+
+            statically_indexed_array<statically_indexed_array<BWarpWindow, KInnerLoopIter>,
+                                     GemmTraits::NIterPerWarp>
+                b_warp_windows;
+
+            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
+                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                    move_tile_window(a_warp_windows(mIter)(kIter),
+                                     {mIter * GemmTraits::MPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
+                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                    move_tile_window(b_warp_windows(nIter)(kIter),
+                                     {nIter * GemmTraits::NPerBlockPerIter,
+                                      kIter * GemmTraits::KPerBlockPerIter});
+                });
+            });
+
+            // TODO check if a_warp_tiles has same desc as a_warp_window
+            static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
+                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                    // read A warp tensor from A block window
+                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                });
+                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                });
+            });
+        }
+
+        // C += A * B
+        template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
+        CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                       const ASmemBlockWindow& a_block_window,
+                                       const BSmemBlockWindow& b_block_window)
+        {
+            static_assert(
+                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
+                "The CDataType as defined in traits should be the same as correspoinding "
+                "C block tensor data type!");
+
+            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
+            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+
+            constexpr auto c_warp_y_lengths =
+                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+            // hot loop:
+            static_for<0, KRepeat, 1>{}([&](auto kIter) {
+                LocalPrefetch<kIter.value>(a_block_window, b_block_window);
+                __builtin_amdgcn_sched_barrier(0);
+                // NOTE: Synchronize threads in a workgroup at the start of each MAC
+                // cluster, but except the first, as we can shorten non-MAC cluster a bit
+                // and there's no observable negative impact. The desired effect is waves in
+                // a workgroup executing MAC in sync. This avoids some out-of-sync waves
+                // hijacking MAC resource from other workgroups and reducing the chance of
+                // latency hiding by waiting for the rest of the workgroup at the eventual
+                // sync point.
+                if constexpr(kIter.value != 0 || KRepeat == 1)
+                {
+                    __builtin_amdgcn_s_barrier();
+                    __builtin_amdgcn_sched_barrier(0);
+                }
+
+                static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) {
+                    static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                        static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                            // read C warp tensor from C block tensor-
+                            CWarpTensor c_warp_tensor;
+
+                            c_warp_tensor.get_thread_buffer() =
+                                c_block_tensor.get_y_sliced_thread_data(
+                                    merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                                    merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                            // The block_sync_lds() here performs double duty:
+                            // A) safeguard against data hazard because barrier from
+                            // blockwise_gemm is moved here B) reduce VMEM FIFO congestion
+                            // by applying small delays to different wavefronts It is
+                            // performed near the end of MAC cluster to minimize lgkmcnt
+                            // penalty
+                            if constexpr(kIter.value == KRepeat - 1 &&
+                                         kInnerIter.value == KInnerLoopIter - 1 &&
+                                         mIter.value == GemmTraits::MIterPerWarp - 1 &&
+                                         nIter.value == GemmTraits::NIterPerWarp - 1)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                block_sync_lds();
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                            // warp GEMM
+                            typename GemmTraits::WarpGemm{}(c_warp_tensor,
+                                                            a_warp_tiles_[mIter][kInnerIter],
+                                                            b_warp_tiles_[nIter][kInnerIter]);
+
+                            // write C warp tensor into C block tensor
+                            c_block_tensor.set_y_sliced_thread_data(
+                                merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                                merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                                c_warp_tensor.get_thread_buffer());
+
+                            if constexpr(kInnerIter.value == 0 && mIter.value == 0 &&
+                                         nIter.value == 0)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                __builtin_amdgcn_s_setprio(1);
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                        });
+                    });
+                });
+
+                __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_s_setprio(0);
+                __builtin_amdgcn_sched_barrier(0);
+            });
+        }
+    };
+
+    public:
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+
+        return c_block_tensor;
+    }
+
+    template <typename ASmemBlockWindow, typename BSmemBlockWindow>
+    CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
+                                      const BSmemBlockWindow& b_block_window)
+    {
+        block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window);
+    }
+
+    // C += A * B
+    template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ASmemBlockWindow& a_block_window,
+                                   const BSmemBlockWindow& b_block_window)
+    {
+        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+    }
+
+    // C = A * B
+    template <typename ASmemBlockWindow, typename BSmemBlockWindow>
+    CK_TILE_DEVICE auto operator()(const ASmemBlockWindow& a_block_window,
+                                   const BSmemBlockWindow& b_block_window)
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+        return c_block_tensor;
+    }
+
+    private:
+    BlockGemmImpl<Scheduler, Traits> block_gemm_impl_{};
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index 85c5c5805..4634e9dcb 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -247,8 +247,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
 
             // Block GEMM
-            constexpr auto block_gemm = BlockGemm();
-            auto c_block_tile         = block_gemm.MakeCBlockTile();
+            auto block_gemm   = BlockGemm();
+            auto c_block_tile = block_gemm.MakeCBlockTile();
 
             using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
             using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
@@ -290,7 +290,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 {
                     static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) {
                         block_sync_lds();
-                        // block_gemm.LocalPrefetch();
+                        block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                         block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
 
                         block_sync_lds();
@@ -318,7 +318,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 static_for<1, tail_num, 1>{}([&](auto prefetch_idx) {
                     block_sync_lds();
 
-                    // block_gemm.LocalPrefetch();
+                    block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                     block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
 
                     block_sync_lds();
@@ -331,14 +331,14 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 });
 
                 block_sync_lds();
-                // block_gemm.LocalPrefetch();
+                block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                 block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
             };
 
             if constexpr(TailNum == TailNumber::One)
             {
                 block_sync_lds();
-                // block_gemm.LocalPrefetch();
+                block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                 block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
             }
             else if constexpr(TailNum == TailNumber::Two)
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
index 5e93ca21c..6f51e6b8a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp
@@ -11,6 +11,7 @@ namespace ck_tile {
 
 enum struct GemmPipelineScheduler
 {
+    Default,
     Intrawave,
     Interwave,
 };
@@ -43,6 +44,7 @@ inline std::ostream& operator<<(std::ostream& os, const ck_tile::GemmPipelineSch
 {
     switch(s)
     {
+    case ck_tile::GemmPipelineScheduler::Default: os << "Default"; break;
     case ck_tile::GemmPipelineScheduler::Intrawave: os << "Intrawave"; break;
     case ck_tile::GemmPipelineScheduler::Interwave: os << "Interwave"; break;
     default: os << "";
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index c765b3ce9..b475ebb7b 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 
 namespace ck_tile {
 
@@ -52,6 +53,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
         constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
         constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
 
+        // TODO: this 8 is AK1! should be a policy parameter!
         constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kKPerBlock / 8>{}, number<kMPerBlock>{}, number<8>{}),
             make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
@@ -264,6 +266,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
                 static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
                 static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
                 constexpr index_t M0 = MPerBlock / (M2 * M1);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M2, M1 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
 
                 return make_static_tile_distribution(
                     tile_distribution_encoding<sequence<1>,
@@ -277,6 +282,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
             {
                 constexpr index_t M0 = BlockSize / get_warp_size();
                 constexpr index_t M1 = MPerBlock / (M2 * M0);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M1, M2 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
                 return make_static_tile_distribution(
                     tile_distribution_encoding<sequence<1>,
                                                tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
@@ -350,6 +358,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
                 static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
                 static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
                 constexpr index_t N0 = NPerBlock / (N2 * N1);
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");
 
                 return make_static_tile_distribution(
                     tile_distribution_encoding<sequence<1>,
@@ -364,7 +375,9 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
             {
                 constexpr index_t N0 = BlockSize / get_warp_size();
                 constexpr index_t N1 = NPerBlock / (N2 * N0);
-
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");
                 return make_static_tile_distribution(
                     tile_distribution_encoding<sequence<1>,
                                                tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
@@ -475,9 +488,28 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
-        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1DefaultPolicy;
-
-        return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
+        constexpr bool TransposeC = false;
+        constexpr auto I0         = number<0>{};
+        constexpr auto I1         = number<1>{};
+        constexpr auto I2         = number<2>{};
+
+        using AccDataType     = float;
+        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                typename Problem::BDataType,
+                                                AccDataType,
+                                                WarpTile::at(I0),
+                                                WarpTile::at(I1),
+                                                WarpTile::at(I2),
+                                                TransposeC>;
+        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                                      typename Problem::BDataType,
+                                                                      typename Problem::CDataType,
+                                                                      BlockWarps,
+                                                                      WarpGemm>;
+
+        return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy>{};
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index 3c43790bd..bf51577ae 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -33,6 +33,8 @@ struct GemmPipelineProblemBase
     static constexpr bool kPadN = GemmTraits::kPadN;
     static constexpr bool kPadK = GemmTraits::kPadK;
 
+    static constexpr auto Scheduler = GemmPipelineScheduler::Default;
+
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA()
     {
         if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index 0a8d2dfbe..a9e466a79 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,9 +21,10 @@ struct WarpGemmAtrributeMfma
     using BVecType = typename Impl::BVecType;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM = Impl::kM;
-    static constexpr index_t kN = Impl::kN;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
@@ -86,9 +87,10 @@ struct WarpGemmAtrributeMfmaIterateK
         ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM = Impl::kM;
-    static constexpr index_t kN = Impl::kN;
-    static constexpr index_t kK = Impl::kK * kKIter;
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
@@ -197,9 +199,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
     using BVecType = typename Impl::AVecType;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
@@ -260,9 +263,10 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
     using BVecType = typename Impl::AVecType;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
@@ -330,9 +334,10 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
         ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM = Impl::kN;
-    static constexpr index_t kN = Impl::kM;
-    static constexpr index_t kK = Impl::kK * kKIter;
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
@@ -444,10 +449,11 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
         ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM      = Impl::kN;
-    static constexpr index_t kN      = Impl::kM;
-    static constexpr index_t kK      = Impl::kK * kKIter;
-    static constexpr index_t SFactor = SFactor_; // group how many CM1 together
+    static constexpr index_t kM          = Impl::kN;
+    static constexpr index_t kN          = Impl::kM;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+    static constexpr index_t SFactor     = SFactor_; // group how many CM1 together
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
@@ -583,10 +589,11 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
         ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
     using CVecType = typename Impl::CVecType;
 
-    static constexpr index_t kM      = Impl::kM;
-    static constexpr index_t kN      = Impl::kN;
-    static constexpr index_t kK      = Impl::kK * kKIter;
-    static constexpr index_t SFactor = SFactor_; // group how many CM1 together
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK * kKIter;
+    static constexpr index_t kKPerThread = Impl::kABKPerLane * kKIter;
+    static constexpr index_t SFactor     = SFactor_; // group how many CM1 together
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
index 182d023a0..f9d50ed35 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -14,6 +14,11 @@ struct WarpGemmImpl
     static constexpr index_t kM = WarpGemmAttribute::kM;
     static constexpr index_t kN = WarpGemmAttribute::kN;
     static constexpr index_t kK = WarpGemmAttribute::kK;
+    /// @brief The number of elements in K dimension processed by single thread in wavefront.
+    ///
+    /// @note  Note that WarpGemm may run MFMA instruction multiple times (on different K).
+    ///        In such situation this value reflects this fact.
+    static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread;
 
     using ADataType = typename WarpGemmAttribute::ADataType;
     using BDataType = typename WarpGemmAttribute::BDataType;
-- 
GitLab


From cf2d635ea27c074e7025896514c4b94034d370cc Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Tue, 26 Nov 2024 20:37:54 +0800
Subject: [PATCH 080/153] [CK_TILE] Fix incorrect computation of group mode
 PagedAttention (#1688)

* Allow getting batch size from splitkv tile partitioner

* Fix wrong paged-kvcache impl for group mode

* Fix wrong example code for page-kvcache

* Undo changes in fmha_fwd.cpp

* Always use 2D block table

* Add is_gappy kernel argument for paged-kvcache

The is_gappy argument is used for differentiating seqstart_k_ptr usage
in flash-attention & xformers

* Remove out-of-date comments

* Remove no-longer used method

* Fix wrong # page-block calculation

* Fix wrong comment

---------

Co-authored-by: Qianfeng <qianfeng.zhang@amd.com>
---
 example/ck_tile/01_fmha/fmha_fwd.cpp          |  1 +
 example/ck_tile/01_fmha/fmha_fwd.hpp          | 12 +++
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   | 56 +++++++-----
 .../fmha_fwd_splitkv_tile_partitioner.hpp     | 10 +--
 ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 90 +++++++++++--------
 5 files changed, 105 insertions(+), 64 deletions(-)

diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 00e0a1653..1f0d73d95 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1046,6 +1046,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                     (0 < page_block_size ? block_table_buf.GetDeviceBuffer() : nullptr);
                 args.batch_stride_block_table = batch_stride_block_table;
                 args.page_block_size          = page_block_size;
+                args.is_gappy = false; // use 'false' for flash-attention integration
 
                 args.cache_batch_idx =
                     (use_cache_batch_idx ? cache_batch_idx_buf.GetDeviceBuffer() : nullptr);
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 704453baa..8a821b917 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -165,6 +165,8 @@ struct fmha_fwd_splitkv_args
     void* block_table_ptr;
     ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
     ck_tile::index_t page_block_size;          // only used if 'block_table_ptr' is not nullptr
+    bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not
+                   // nullptr.
 
     const void* cache_batch_idx;
 
@@ -173,12 +175,21 @@ struct fmha_fwd_splitkv_args
     //             seqlen_k = kargs.seqlen_k
     // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
     //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
+    //                      or kargs.seqlen_k_ptr[b]
+    //
     // batch mode (kvcache):
     //             seqlen_q = kargs.seqlen_q
     //             seqlen_k = kargs.seqlen_k_ptr[b]
     // group mode (kvcache):
     //             seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
+    //
+    //     when is_gappy=true:
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    //             seqstart_k_ptr[b] now store local offset of each batch
+    //
+    //     when is_gappy=false:
     //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
+    //                      or kargs.seqlen_k_ptr[b]
     const void* seqstart_q_ptr;
     const void* seqstart_k_ptr;
     const void* seqlen_k_ptr;
@@ -395,6 +406,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                      args.block_table_ptr,
                                      args.batch_stride_block_table,
                                      args.page_block_size,
+                                     args.is_gappy,
                                      args.scale_s,
                                      args.scale_p,
                                      args.stride_q,
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 3c4e02d08..dcb671d81 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -172,13 +172,18 @@ struct FmhaFwdSplitKVKernel
         float scale_p;
     };
 
-    struct PageBlockTableKargs
+    struct CommonPageBlockTableKargs
     {
         const int32_t* block_table_ptr;
         ck_tile::index_t batch_stride_block_table;
         ck_tile::index_t page_block_size;
     };
 
+    struct GroupModePageBlockTableKargs : CommonPageBlockTableKargs
+    {
+        bool is_gappy = false;
+    };
+
     struct CacheBatchIdxKargs
     {
         const int32_t* cache_batch_idx;
@@ -193,7 +198,7 @@ struct FmhaFwdSplitKVKernel
                                                 EmptyKargs<0>>>,
           std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>,
           std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>,
-          std::conditional_t<kIsPagedKV, PageBlockTableKargs, CacheBatchIdxKargs>
+          std::conditional_t<kIsPagedKV, CommonPageBlockTableKargs, CacheBatchIdxKargs>
     {
         const int32_t* seqlen_k_ptr;
 
@@ -215,7 +220,7 @@ struct FmhaFwdSplitKVKernel
                                                 EmptyKargs<0>>>,
           std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>,
           std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>,
-          std::conditional_t<kIsPagedKV, PageBlockTableKargs, EmptyKargs<3>>
+          std::conditional_t<kIsPagedKV, GroupModePageBlockTableKargs, EmptyKargs<3>>
     {
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
@@ -375,6 +380,7 @@ struct FmhaFwdSplitKVKernel
               const void* block_table_ptr,
               ck_tile::index_t batch_stride_block_table,
               ck_tile::index_t page_block_size,
+              bool is_gappy,
               float scale_s,
               float scale_p,
               ck_tile::index_t stride_q,
@@ -461,6 +467,7 @@ struct FmhaFwdSplitKVKernel
             kargs.block_table_ptr          = reinterpret_cast<const int32_t*>(block_table_ptr);
             kargs.batch_stride_block_table = batch_stride_block_table;
             kargs.page_block_size          = page_block_size;
+            kargs.is_gappy                 = is_gappy;
         }
 
         return kargs;
@@ -495,11 +502,13 @@ struct FmhaFwdSplitKVKernel
         const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
 
         long_index_t batch_offset_q       = 0;
-        long_index_t batch_offset_k       = 0;
-        long_index_t batch_offset_v       = 0;
+        long_index_t batch_offset_k       = 0; // unused for paged-kvcache
+        long_index_t batch_offset_v       = 0; // unused for paged-kvcache
         long_index_t batch_offset_bias    = 0;
         long_index_t batch_offset_lse_acc = 0;
         long_index_t batch_offset_o_acc   = 0;
+        index_t kv_l2p_offset =
+            0; // logical-to-physical offset of seqlen_k coordinate. only used for paged-kvcache
 
         if constexpr(kIsGroupMode)
         {
@@ -508,22 +517,14 @@ struct FmhaFwdSplitKVKernel
             const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
 
             batch_offset_q = query_start * kargs.stride_q;
-            if constexpr(kIsPagedKV)
+            batch_offset_k = key_start * kargs.stride_k;
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
-                batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
-                batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
+                batch_offset_v = key_start * kargs.stride_v;
             }
             else
             {
-                batch_offset_k = key_start * kargs.stride_k;
-                if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
-                {
-                    batch_offset_v = key_start * kargs.stride_v;
-                }
-                else
-                {
-                    batch_offset_v = key_start;
-                }
+                batch_offset_v = key_start;
             }
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
             {
@@ -551,6 +552,15 @@ struct FmhaFwdSplitKVKernel
             {
                 kargs.seqlen_k = kargs.seqstart_k_ptr[i_batch + 1] - kargs.seqstart_k_ptr[i_batch];
             }
+
+            if constexpr(kIsPagedKV)
+            {
+                if(kargs.is_gappy)
+                {
+                    // seqstart_k_ptr has different meaning in this case
+                    kv_l2p_offset = kargs.seqstart_k_ptr[i_batch];
+                }
+            }
         }
         else
         {
@@ -703,7 +713,7 @@ struct FmhaFwdSplitKVKernel
                     reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
                     i_batch_ * kargs.batch_stride_block_table;
                 const index_t num_blocks =
-                    integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size);
+                    integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size);
 
                 const long_index_t fixed_offset =
                     static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
@@ -718,7 +728,8 @@ struct FmhaFwdSplitKVKernel
                     kargs.page_block_size,
                     k_dram,
                     make_k_dram(nullptr,
-                                kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size));
+                                (kv_l2p_offset + kargs.seqlen_k) -
+                                    (num_blocks - 1) * kargs.page_block_size));
             }
             else
             {
@@ -733,7 +744,7 @@ struct FmhaFwdSplitKVKernel
                     reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
                     i_batch_ * kargs.batch_stride_block_table;
                 const index_t num_blocks =
-                    integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size);
+                    integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size);
 
                 const long_index_t fixed_offset =
                     static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
@@ -748,7 +759,8 @@ struct FmhaFwdSplitKVKernel
                     kargs.page_block_size,
                     v_dram,
                     make_v_dram(nullptr,
-                                kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size));
+                                (kv_l2p_offset + kargs.seqlen_k) -
+                                    (num_blocks - 1) * kargs.page_block_size));
             }
             else
             {
@@ -896,6 +908,7 @@ struct FmhaFwdSplitKVKernel
                                       mask,
                                       position_encoding,
                                       kargs.scale_s,
+                                      kv_l2p_offset,
                                       smem_ptr);
             }
             else
@@ -912,6 +925,7 @@ struct FmhaFwdSplitKVKernel
                                       mask,
                                       position_encoding,
                                       kargs.scale_s,
+                                      kv_l2p_offset,
                                       smem_ptr);
             }
         }();
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
index 675a31019..5a52fa0f6 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
@@ -18,11 +18,11 @@ struct FmhaFwdSplitKVTilePartitioner
     static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1;
     static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1;
 
-    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                            ck_tile::index_t nhead,
-                                            ck_tile::index_t max_seqlen_q,
-                                            ck_tile::index_t hdim_v,
-                                            ck_tile::index_t num_splits)
+    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
+                                                ck_tile::index_t nhead,
+                                                ck_tile::index_t max_seqlen_q,
+                                                ck_tile::index_t hdim_v,
+                                                ck_tile::index_t num_splits)
     {
         // TODO: this may need tuning
         return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) *
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 4e8d8694d..04aa85644 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -143,6 +143,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                FmhaMask mask,
                PositionEncoding position_encoding,
                float scale_s,
+               index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate
                void* smem_ptr) const
     {
         static_assert(
@@ -211,16 +212,16 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
         set_tile(m, -numeric<SMPLComputeDataType>::infinity());
         clear_tile(l);
 
-        const auto q_origin                       = q_dram_window.get_window_origin();
-        const auto [seqlen_k_start, seqlen_k_end] = mask.GetTileRangeAlongX(
+        const auto q_origin = q_dram_window.get_window_origin();
+        const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX(
             q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
 
         // check early exit if no work to do
         if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
         {
-            const index_t original_num_total_loop =
-                integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0);
-            if(original_num_total_loop <= 0)
+            const index_t logical_num_total_loop =
+                integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0);
+            if(logical_num_total_loop <= 0)
             {
                 if constexpr(kStoreLSE)
                 {
@@ -239,33 +240,41 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             }
         }
 
-        // make sure the first tile is completely located in page-block
-        const index_t adjusted_seqlen_k_start = [&, seqlen_k_start_ = seqlen_k_start] {
-            if constexpr(kIsPagedKV)
-            {
-                return kN0 * integer_divide_floor(seqlen_k_start_, kN0);
-            }
-            else
-            {
-                return seqlen_k_start_;
-            }
-        }();
+        const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset;
+        const index_t physical_seqlen_k_end   = logical_seqlen_k_end + kv_l2p_offset;
+        // make sure the first tile is completely located in page-block (page-block size should be
+        // divisible by kN0)
+        // relationship between each *_start variables: aligned_physical_seqlen_k_start <=
+        // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start
+        const index_t aligned_physical_seqlen_k_start =
+            [&, physical_seqlen_k_start_ = physical_seqlen_k_start] {
+                if constexpr(kIsPagedKV)
+                {
+                    return kN0 * integer_divide_floor(physical_seqlen_k_start_, kN0);
+                }
+                else
+                {
+                    return physical_seqlen_k_start_;
+                }
+            }();
         const index_t num_total_loop =
-            integer_divide_ceil(seqlen_k_end - adjusted_seqlen_k_start, kN0);
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
 
         auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window(
-            k_dram_block_window_lengths, {adjusted_seqlen_k_start, 0});
+            k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0});
 
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
         auto bias_dram_window =
             make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
-                             {bias_origin.at(number<0>{}), adjusted_seqlen_k_start}, // M/N
+                             {bias_origin.at(number<0>{}),
+                              logical_seqlen_k_start - (physical_seqlen_k_start -
+                                                        aligned_physical_seqlen_k_start)}, // M/N
                              Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
 
         auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
             v_dram_block_window_lengths,
-            {0, adjusted_seqlen_k_start}, // TODO: hdim split?
+            {0, aligned_physical_seqlen_k_start}, // TODO: hdim split?
             Policy::template MakeVDramTileDistribution<Problem>());
 
         auto q_tile = tile_elementwise_in(q_element_func, q);
@@ -379,7 +388,8 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                         constexpr auto i_j_idx = make_tuple(idx0, idx1);
 
                         s_acc(i_j_idx) *= scale_s;
-                        position_encoding.update(s_acc(i_j_idx), row, col);
+                        // position_encoding accept only logical coordinates, do conversion here
+                        position_encoding.update(s_acc(i_j_idx), row, col - kv_l2p_offset);
                     });
                 });
             }
@@ -397,29 +407,31 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             {
                 const auto k_origin = k_page_block_navigator.to_global_window_origin(
                     i_page_block_k, k_dram_block_window.get_window_origin());
-                set_tile_if(s_acc,
-                            -numeric<SMPLComputeDataType>::infinity(),
-                            [&, seqlen_k_start_ = seqlen_k_start, seqlen_k_end_ = seqlen_k_end](
-                                auto tile_idx) {
-                                const auto col =
-                                    k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                                if constexpr(kIsPagedKV)
-                                {
-                                    return col < seqlen_k_start_ || seqlen_k_end_ <= col;
-                                }
-                                else
-                                {
-                                    return seqlen_k_end_ <= col;
-                                }
-                            });
+                set_tile_if(
+                    s_acc,
+                    -numeric<SMPLComputeDataType>::infinity(),
+                    [&,
+                     physical_seqlen_k_start_ = physical_seqlen_k_start,
+                     physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
+                        const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                        if constexpr(kIsPagedKV)
+                        {
+                            return col < physical_seqlen_k_start_ || physical_seqlen_k_end_ <= col;
+                        }
+                        else
+                        {
+                            return physical_seqlen_k_end_ <= col;
+                        }
+                    });
             }
 
             if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
             {
                 const auto k_origin = k_page_block_navigator.to_global_window_origin(
                     i_page_block_k, k_dram_block_window.get_window_origin());
+                // mask accept only logical coordinates, do conversion here
                 bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}),
-                                                           k_origin.at(number<0>{}),
+                                                           k_origin.at(number<0>{}) - kv_l2p_offset,
                                                            number<kM0>{},
                                                            number<kN0>{});
                 if(need_perpixel_check)
@@ -428,7 +440,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                         s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
                             const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
                             const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
-                            return mask.IsOutOfBound(row, col);
+                            return mask.IsOutOfBound(row, col - kv_l2p_offset);
                         });
                 }
             }
@@ -659,6 +671,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                FmhaMask mask,
                PositionEncoding position_encoding,
                float scale_s,
+               index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate
                void* smem_ptr) const
     {
         return operator()(q_dram_block_window_tmp,
@@ -681,6 +694,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
                           mask,
                           position_encoding,
                           scale_s,
+                          kv_l2p_offset,
                           smem_ptr);
     }
 };
-- 
GitLab


From b70f367f8051e0c66071a25ab95a77e076762808 Mon Sep 17 00:00:00 2001
From: jakpiase <jakub.piasecki@amd.com>
Date: Tue, 26 Nov 2024 13:56:32 +0100
Subject: [PATCH 081/153] Add check for bf16 splitk support for grouped gemm
 splitk (#1673)

* add check for bf16 splitk support for grouped gemm splitk

* Update if condition

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
---
 .../device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp  | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 6d9d1459c..cb0afbb08 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -538,6 +538,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
             return false;
         }
 
+        if(std::is_same_v<EDataType, ck::bhalf_t> && arg.K_BATCH > 1 && !is_bf16_atomic_supported())
+        {
+            return false;
+        }
+
         bool supported = true;
         for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
         {
-- 
GitLab


From bfe983a1518935ef8d81066b540b8aea51b8e883 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:36:53 +0100
Subject: [PATCH 082/153] Change block gemm pipeline local prefill loop order.
 (#1692)

* Fix loop order.

* Fix loop order in pipeline v4
---
 .../blockwise_gemm_pipeline_xdlops_v2.hpp     | 130 +++++++++---------
 .../blockwise_gemm_pipeline_xdlops_v4.hpp     |  65 +++++----
 2 files changed, 96 insertions(+), 99 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
index 711c47854..54edf0c35 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
@@ -269,15 +269,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                                                a_thread_desc_,
                                                make_tuple(m0, I0, k, I0),
                                                a_thread_buf);
-                            static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                b_thread_copy_.Run(
-                                    b_block_desc_n0_n1_n2_k,
-                                    make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                    b_block_buf,
-                                    b_thread_desc_,
-                                    make_tuple(n0, I0, k, I0),
-                                    b_thread_buf);
-                            });
+                        });
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                               b_block_buf,
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k, I0),
+                                               b_thread_buf);
                         });
                     });
 
@@ -341,14 +340,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                                            a_thread_desc_,
                                            make_tuple(m0, I0, k, I0),
                                            a_thread_buf);
-                        static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                               b_block_buf,
-                                               b_thread_desc_,
-                                               make_tuple(n0, I0, k, I0),
-                                               b_thread_buf);
-                        });
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, I0),
+                                           b_thread_buf);
                     });
                 });
 
@@ -396,14 +395,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k, I0),
                                        a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
 
@@ -447,14 +446,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k, I0),
                                        a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
                 });
             });
 
@@ -760,15 +759,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                                a_thread_desc_,
                                                make_tuple(m0, I0, k0, I0),
                                                a_thread_buf);
-                            static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                b_thread_copy_.Run(
-                                    b_block_desc_n0_n1_n2_k,
-                                    make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                    b_block_buf,
-                                    b_thread_desc_,
-                                    make_tuple(n0, I0, k0, I0),
-                                    b_thread_buf);
-                            });
+                        });
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                               b_block_buf,
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k0, I0),
+                                               b_thread_buf);
                         });
                         __builtin_amdgcn_sched_barrier(0);
                         // NOTE: Synchronize threads in a workgroup at the start of each MAC
@@ -866,14 +864,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                            a_thread_desc_,
                                            make_tuple(m0, I0, k0, I0),
                                            a_thread_buf);
-                        static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                               make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                               b_block_buf,
-                                               b_thread_desc_,
-                                               make_tuple(n0, I0, k0, I0),
-                                               b_thread_buf);
-                        });
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k0, I0),
+                                           b_thread_buf);
                     });
 
                     __builtin_amdgcn_sched_barrier(0);
@@ -942,14 +940,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k0, I0),
                                        a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k0, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
 
                 __builtin_amdgcn_sched_barrier(0);
@@ -1018,14 +1016,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k0, I0),
                                        a_thread_buf);
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k0, I0),
-                                           b_thread_buf);
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k0, I0),
+                                       b_thread_buf);
                 });
 
                 __builtin_amdgcn_sched_barrier(0);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
index bd5a1bedf..e8d105111 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
@@ -305,14 +305,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                                    a_thread_desc_,
                                    make_tuple(m0, I0, k, I0),
                                    a_thread_bufs(I0));
-                static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                       b_block_buf.At(I0),
-                                       b_thread_desc_,
-                                       make_tuple(n0, I0, k, I0),
-                                       b_thread_bufs(I0));
-                });
+            });
+            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                   make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                   b_block_buf.At(I0),
+                                   b_thread_desc_,
+                                   make_tuple(n0, I0, k, I0),
+                                   b_thread_bufs(I0));
             });
         });
 
@@ -356,15 +356,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                                                a_thread_desc_,
                                                make_tuple(m0, I0, k, I0),
                                                a_thread_bufs(lds_read_reg_buf));
-                            static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                b_thread_copy_.Run(
-                                    b_block_desc_n0_n1_n2_k,
-                                    make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                    b_block_buf.At(lds_read_buf),
-                                    b_thread_desc_,
-                                    make_tuple(n0, I0, k, I0),
-                                    b_thread_bufs(lds_read_reg_buf));
-                            });
+                        });
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                               b_block_buf.At(lds_read_buf),
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k, I0),
+                                               b_thread_bufs(lds_read_reg_buf));
                         });
                     });
 
@@ -437,14 +436,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k, I0),
                                        a_thread_bufs(lds_read_reg_buf));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf.At(lds_read_buf),
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_bufs(lds_read_reg_buf));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
                 });
             });
 
@@ -496,14 +495,14 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                                        a_thread_desc_,
                                        make_tuple(m0, I0, k, I0),
                                        a_thread_bufs(lds_read_reg_buf));
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf.At(lds_read_buf),
-                                           b_thread_desc_,
-                                           make_tuple(n0, I0, k, I0),
-                                           b_thread_bufs(lds_read_reg_buf));
-                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
                 });
             });
 
-- 
GitLab


From abae2afc721d9b335ef07d7227e0f9e55b1c575a Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Wed, 27 Nov 2024 05:01:15 +0800
Subject: [PATCH 083/153] support max3 in smoothquant and add+ rmsnorm +
 rdquant (#1654)

* Fix cmake example build

* Support max3 in smoothquant one pass

* support max3 in two pass

* support max3 in add_rmsnorm_rdquant
---
 example/ck_tile/12_smoothquant/CMakeLists.txt |  4 +-
 ...msnorm2d_rdquant_fwd_pipeline_one_pass.hpp | 37 +++++++++++++++----
 ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 26 ++++++++++---
 .../smoothquant_pipeline_one_pass.hpp         | 30 +++++++++++++--
 .../smoothquant_pipeline_two_pass.hpp         | 16 +++++++-
 5 files changed, 94 insertions(+), 19 deletions(-)

diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt
index 09a56c6da..3849833ac 100644
--- a/example/ck_tile/12_smoothquant/CMakeLists.txt
+++ b/example/ck_tile/12_smoothquant/CMakeLists.txt
@@ -18,7 +18,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC)
     target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS})
 endfunction(add_smoothquant_example TARGET_NAME MAIN_SRC)
 
-file(GLOB INSTANCE_SRCS instances/*.cpp)
 
-add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS})
 add_smoothquant_example(tile_example_smoothquant example_smoothquant.cpp)
+file(GLOB INSTANCE_SRCS instances/*.cpp)
+add_smoothquant_example(tile_smoothquant smoothquant.cpp ${INSTANCE_SRCS})
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
index 12a15938a..24f35d363 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
@@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
     static constexpr bool kSaveX    = Problem::kSaveX;
 
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
-    static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
-    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadM   = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
+    static constexpr bool kPadN   = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -69,9 +70,16 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
         auto reduce_absmax_func     = ReduceOp::AbsMax{};
-        auto reduce_max_func        = ReduceOp::Max{};
-        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
-        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto reduce_absmax3_func    = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+        auto reduce_max_func     = ReduceOp::Max{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
             Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
 
@@ -116,8 +124,23 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
         });
 
         // compute absmax, each-thread->cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
         block_reduce2d_sync(absmax, reduce_max_func);
         block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
 
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
index 0dbb20645..aec7368e2 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
@@ -28,8 +28,9 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
     static constexpr bool kSaveX    = Problem::kSaveX;
 
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
-    static constexpr bool kPadM = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
-    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadM   = false; // TODO - BlockAddRmsnorm2dRdquantFwdProblem::kPadM
+    static constexpr bool kPadN   = Problem::kPadN;
+    static constexpr bool UseMax3 = true; // TODO - Move to trait
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -76,9 +77,16 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
         auto reduce_absmax_func     = ReduceOp::AbsMax{};
-        auto reduce_max_func        = ReduceOp::Max{};
-        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
-        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
+        auto reduce_absmax3_func    = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+        auto reduce_max_func     = ReduceOp::Max{};
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
             Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
 
@@ -177,7 +185,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
                 y(idx) = type_convert<ComputeDataType>(y_);
             });
 
-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);
 
             if constexpr(kSaveX)
                 move_tile_window(x_window, {0, -Block_N});
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
index d5b3780de..b2fc240c1 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineOnePass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
     static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass
             xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
-        auto reduce_max_func     = ReduceOp::Max{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+        auto reduce_max_func = ReduceOp::Max{};
+
         auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
         auto block_reduce2d_cross_warp_sync =
@@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass
             xscale);
 
         // compute absmax, cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
         block_reduce2d_sync(absmax, reduce_max_func);
         block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
 
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
index 7878ef1d3..9e9df663b 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
     static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait
 
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
@@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass
             __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
         auto reduce_max_func     = ReduceOp::Max{};
         auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass
                 x,
                 xscale);
 
-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);
 
             move_tile_window(x_window, {0, Block_N});
             move_tile_window(xscale_window, {Block_N});
-- 
GitLab


From cb8c7f42d6123f548306cbd679c3d18349f10b6d Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 26 Nov 2024 14:58:35 -0800
Subject: [PATCH 084/153] update mainline compiler branch name (#1696)

---
 Dockerfile  |  4 ++--
 Jenkinsfile | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 76e6f0ebe..38a563ce3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -116,7 +116,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'" && \
     sh -c "echo compiler commit = '$compiler_commit'"
 
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
         git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
         cd llvm-project && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
@@ -124,7 +124,7 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
     else echo "using the release compiler"; \
     fi
 
-RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
         git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
         cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
         cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
diff --git a/Jenkinsfile b/Jenkinsfile
index 2f790d8e5..b448a5130 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -133,7 +133,7 @@ def buildDocker(install_prefix){
     def image_name = getDockerImageName()
     echo "Building Docker for ${image_name}"
     def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
-    if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+    if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
         dockerArgs = dockerArgs + " --no-cache "
     }
     echo "Build Args: ${dockerArgs}"
@@ -358,7 +358,7 @@ def buildHipClangJob(Map conf=[:]){
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
             dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
         }
         def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
@@ -549,7 +549,7 @@ def Build_CK(Map conf=[:]){
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
+        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
             dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
         }
         if(params.BUILD_LEGACY_OS){
@@ -737,7 +737,7 @@ def process_results(Map conf=[:]){
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
                                               0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true
                                               0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                               0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
                                               0 13 * * * % BUILD_LEGACY_OS=true''' : ""
 
@@ -765,7 +765,7 @@ pipeline {
         string(
             name: 'COMPILER_VERSION', 
             defaultValue: '', 
-            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline-open, or leave blank (default).')
+            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
         string(
             name: 'COMPILER_COMMIT', 
             defaultValue: '', 
-- 
GitLab


From 061ac0649c75deb315a418466d00dea2c49e65f3 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Wed, 27 Nov 2024 13:02:44 +0100
Subject: [PATCH 085/153] Polished Grouped GEMM APIs and new BF16 instances
 (#1600)

* Few small fixes.

* New GroupedGemm instances (BF16)

* Unify and refactor GroupedGEMM device API.

* Adapt changes to new API.

* Adapt grouped gemm profiler.

* Accept multiple kbatches for grouped gemm profiler.

- delete obsolete two stage as it is now covered by grouped gemm

* Update unit test for grouped gemm.

* Fix thresholds for BF16 and F8. Unblock tests.

* Fix few instances.

* Multiple small fixes.

* Adapt to new API, check dynamic casting.

* Uncomment few data types in grouped gemm profiler.

* Fix call to SetDeviceArgs.

* Fix profile grouped gemm multiply tile loop.

* Fix grouped gemm tile loop kernel args in client examples.

* Review comments.
---
 ...emm_multiply_bias_fastgelu_xdl_bf16_i8.cpp |   2 +-
 .../grouped_gemm_multiply_xdl_bf16_i8.cpp     |   2 +-
 ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp |   4 +-
 .../grouped_gemm_multiple_d_xdl_fp16.cpp      |   2 +-
 .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp   |   4 +-
 .../grouped_gemm_xdl_fixed_nk_fp16.cpp        |   4 +-
 .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp    |   4 +-
 .../run_grouped_gemm_example.inc              |  18 +-
 .../gpu/device/device_grouped_gemm.hpp        | 132 ++++++-
 .../device/device_grouped_gemm_fixed_nk.hpp   |  50 +--
 .../device_grouped_gemm_multiple_d_splitk.hpp | 136 -------
 .../gpu/device/device_grouped_gemm_splitk.hpp |  20 +-
 .../device/device_grouped_gemm_tile_loop.hpp  |  92 +----
 ...ltiple_d_splitk_xdl_cshuffle_two_stage.hpp |  93 +++--
 ...gemm_multiple_d_xdl_cshuffle_tile_loop.hpp |  24 +-
 .../device/impl/device_grouped_gemm_xdl.hpp   |  21 +-
 .../impl/device_grouped_gemm_xdl_fixed_nk.hpp |  72 +++-
 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp |  35 +-
 include/ck/utility/loop_scheduler.hpp         |   1 -
 .../gpu/grouped_gemm.hpp                      | 185 ++++++++-
 ...evice_grouped_gemm_xdl_splitk_instance.hpp | 138 +++++++
 .../gpu/grouped_gemm/CMakeLists.txt           |  22 +-
 ..._bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp |  32 ++
 ...bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp |  36 ++
 ..._bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp |  33 ++
 ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp |  32 ++
 ...bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp |  36 ++
 ..._bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp |  38 ++
 ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp |  32 ++
 ...bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp |  36 ++
 ..._bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp |  33 ++
 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp |  47 +--
 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 123 ------
 ...itk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp |  32 ++
 ...6_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp |  36 ++
 ...itk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp |  33 ++
 ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp |  51 +--
 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp |  55 +--
 ...ultiply_bf16_i8_bf16_mk_kn_mn_instance.cpp | 234 -----------
 .../profiler/profile_grouped_gemm_impl.hpp    | 121 +++---
 ...e_grouped_gemm_multiply_tile_loop_impl.hpp |   3 +-
 .../profile_grouped_gemm_tile_loop_impl.hpp   |   2 +-
 .../profile_grouped_gemm_two_stage_impl.hpp   | 367 ------------------
 profiler/src/CMakeLists.txt                   |   1 -
 profiler/src/profile_grouped_gemm.cpp         |  89 ++++-
 .../src/profile_grouped_gemm_fixed_nk.cpp     |   8 +-
 .../src/profile_grouped_gemm_two_stage.cpp    | 228 -----------
 test/grouped_gemm/CMakeLists.txt              |   6 -
 .../test_grouped_gemm_splitk_xdl.cpp          |  46 ++-
 .../test_grouped_gemm_ut_cases.inc            | 131 +------
 test/grouped_gemm/test_grouped_gemm_util.hpp  | 139 +++----
 51 files changed, 1399 insertions(+), 1722 deletions(-)
 delete mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp
 delete mode 100644 profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
 delete mode 100644 profiler/src/profile_grouped_gemm_two_stage.cpp

diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
index 4b284c74d..47d3e0abf 100644
--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
@@ -121,7 +121,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     constexpr ck::index_t NumDTensor = 2;
 
     using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
 
     std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
     grouped_gemm_kernel_args_.reserve(group_count);
diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
index 6cc83e06f..8c705d3bc 100644
--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
@@ -120,7 +120,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     constexpr ck::index_t NumDTensor = 1;
 
     using GroupedGemmKernelArgument =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+        ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
 
     std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
     grouped_gemm_kernel_args_.reserve(group_count);
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
index ecff7b471..8bbf8e629 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
@@ -246,7 +246,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     // do GEMM
     auto argument = gemm.MakeArgument(
         p_As, p_Bs, p_Ds, p_Cs, gemm_descs, a_element_op, b_element_op, cde_element_op);
-    gemm.SetKBatchSize(argument, config.k_batch);
+    gemm.SetKBatchSize(&argument, config.k_batch);
     if(!gemm.IsSupportedArgument(argument))
     {
         throw std::runtime_error(
@@ -257,7 +257,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer());
 
     DeviceMem gemm_arg_dev_mem(gemm.GetDeviceKernelArgSize(&argument));
-    gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer());
+    gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer());
 
     invoker.Run(argument, StreamConfig{nullptr, false, 1});
 
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
index 965a0e7e3..e7b2ee417 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -91,7 +91,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 {
     auto group_count = problem_size.group_count;
 
-    using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDs>;
+    using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<NumDs>;
     using GemmDesc        = ck::tensor_operation::device::GemmDesc;
 
     // GEMM shape
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
index a193fc39b..3b3ef508c 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -254,7 +254,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                               gemm.GetDeviceKernelArgSize(&argument),
                               hipMemcpyHostToDevice));
 
-    gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer());
+    gemm.SetDeviceKernelArgs(&argument, gemm_kernel_args_dev.GetDeviceBuffer());
     gemm.SetKBatch(argument, config.k_batch);
 
     invoker.Run(argument, StreamConfig{nullptr, false});
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
index 1a2bcfb33..c1043f419 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -239,7 +239,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             "not support this GEMM problem");
     }
 
-    gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer());
+    gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer());
     gemm.SetKBatch(argument, config.k_batch);
 
     invoker.Run(argument, StreamConfig{nullptr, false});
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
index 0a63a2984..c81874b06 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -240,7 +240,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             "not support this GEMM problem");
     }
 
-    gemm.SetDeviceKernelArgs(argument, gemm_arg_dev_mem.GetDeviceBuffer());
+    gemm.SetDeviceKernelArgs(&argument, gemm_arg_dev_mem.GetDeviceBuffer());
     gemm.SetKBatch(argument, config.k_batch);
 
     invoker.Run(argument, StreamConfig{nullptr, false});
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 320870e0d..7cb0588b8 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -168,9 +168,23 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     auto argument = gemm.MakeArgument(
         p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op);
 
-    DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));
+    std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
+    std::size_t kargs_size     = gemm.GetDeviceKernelArgSize(&argument);
 
-    gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
+    DeviceMem gemm_workspace, gemm_kargs;
+
+    // The following is necessary since TwoStage kernel is using additional memory both
+    // for Workspace and kernel arguments.
+    if(kargs_size > 0)
+    {
+        gemm_kargs.Realloc(kargs_size);
+        gemm.SetDeviceKernelArgs(&argument, gemm_kargs.GetDeviceBuffer());
+    }
+    if(workspace_size > 0 && workspace_size != kargs_size)
+    {
+        gemm_workspace.Realloc(workspace_size);
+        gemm.SetWorkSpacePointer(&argument, gemm_workspace.GetDeviceBuffer());
+    }
 
     if(!gemm.IsSupportedArgument(argument))
     {
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
index 1e0340553..267a970ee 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
@@ -1,17 +1,87 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include <array>
 #include <iostream>
+#include <sstream>
+#include <stdexcept>
 #include <vector>
 
 #include "device_base.hpp"
+#include "ck/utility/ignore.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
+///
+/// @brief      Structure representing single GEMM problem arguments.
+///
+///             The pointer to the vector of those structures is passed to the GroupedGEMM entry
+///             point kernel.
+///
+/// @tparam     NumDTensor  The number of D input tensors.
+///
+template <index_t NumDTensor = 0>
+struct GroupedGemmKernelArgument
+{
+    __host__ __device__ GroupedGemmKernelArgument(const void* p_a_grid_,
+                                                  const void* p_b_grid_,
+                                                  std::array<const void*, NumDTensor> p_ds_grid_,
+                                                  void* p_e_grid_,
+                                                  index_t M_,
+                                                  index_t N_,
+                                                  index_t K_,
+                                                  index_t StrideA_,
+                                                  index_t StrideB_,
+                                                  std::array<index_t, NumDTensor> StrideDs_,
+                                                  index_t StrideE_)
+        : p_a_grid{p_a_grid_},
+          p_b_grid{p_b_grid_},
+          p_ds_grid{p_ds_grid_},
+          p_e_grid{p_e_grid_},
+          M{M_},
+          N{N_},
+          K{K_},
+          StrideA{StrideA_},
+          StrideB{StrideB_},
+          StrideDs{StrideDs_},
+          StrideE{StrideE_}
+    {
+    }
+
+    const void* p_a_grid;
+    const void* p_b_grid;
+    std::array<const void*, NumDTensor> p_ds_grid;
+    void* p_e_grid;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t StrideA;
+    index_t StrideB;
+    std::array<index_t, NumDTensor> StrideDs;
+    index_t StrideE;
+
+    void Print() const
+    {
+        std::stringstream str;
+        for(auto sd : StrideDs)
+            str << sd << ",";
+
+        std::cout << "arg {"
+                  << "M:" << M << ", "
+                  << "N:" << N << ", "
+                  << "K:" << K << ", "
+                  << "SA:" << StrideA << ", "
+                  << "SB:" << StrideB << ", "
+                  << "SE:" << StrideE << ", "
+                  << "SDs: {" << str.str() << "}"
+                  << "}" << std::endl;
+    }
+};
+
 struct GemmDesc
 {
     ck::index_t M_, N_, K_;
@@ -48,6 +118,66 @@ struct DeviceGroupedGemm : public BaseOperator
                         CElementwiseOperation c_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    //---------------------------------------------------------------------------------------------
+    /// @brief      Sets the device kernel arguments pointer and may copy data to device.
+    ///
+    /// TODO: Add which kernels are using this (TileLoop * FixedNK ??)
+    ///
+    /// @param      p_arg               The pointer to the Argument we're going to update.
+    /// @param[in]  p_dev_kernel_args   The pointer to the device memory which will contain kernel
+    ///                                 arguments.
+    /// @param[in]  p_host_kernel_args  The pointer to the host memory which contains kernel
+    ///                                 arguments that should be copied to device memory.
+    ///
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg,
+                                     void* p_dev_kernel_args,
+                                     const void* p_host_kernel_args) const
+    {
+        ignore = p_arg;
+        ignore = p_dev_kernel_args;
+        ignore = p_host_kernel_args;
+
+        std::ostringstream err;
+        err << "This function is not implemented by the kernel: " << this->GetTypeString()
+            << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
+    }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the device kernel arguments pointer and may copy data to device.
+    ///
+    /// @param      p_arg              The pointer to the Argument we're going to update.
+    /// @param[in]  p_dev_kernel_args  The pointer to the device memory which contains kernel
+    ///                                arguments.
+    ///
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const
+    {
+        ignore = p_arg;
+        ignore = p_dev_kernel_args;
+
+        std::ostringstream err;
+        err << "This function is not implemented by the kernel: " << this->GetTypeString()
+            << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
+    }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Gets the device kernel argument size.
+    ///
+    /// @param[in]  p_arg  The pointer to the Device op Argument.
+    ///
+    /// @return     The device kernel argument size.
+    ///
+    virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const
+    {
+        ignore = p_arg;
+
+        std::ostringstream err;
+        err << "This function is not implemented by the kernel: " << this->GetTypeString()
+            << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
index fcb2ba6a4..780a0c30c 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
@@ -1,35 +1,14 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
-#include <iostream>
-#include <array>
-
-#include "device_grouped_gemm.hpp"
+#include "device_grouped_gemm_splitk.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-template <index_t NumDTensor = 0>
-struct GroupedGemmKernelArgument
-{
-    const void* p_a_grid;
-    const void* p_b_grid;
-    std::array<const void*, NumDTensor> p_ds_grid;
-    void* p_e_grid;
-
-    index_t M;
-    index_t N;
-    index_t K;
-
-    index_t StrideA;
-    index_t StrideB;
-    std::array<index_t, NumDTensor> StrideDs;
-    index_t StrideE;
-};
-
 template <typename ALayout,
           typename BLayout,
           typename DsLayout,
@@ -41,21 +20,18 @@ template <typename ALayout,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
-struct DeviceGroupedGemmFixedNK : DeviceGroupedGemm<ALayout,
-                                                    BLayout,
-                                                    DsLayout,
-                                                    ELayout,
-                                                    ADataType,
-                                                    BDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    AElementwiseOperation,
-                                                    BElementwiseOperation,
-                                                    CElementwiseOperation>
+struct DeviceGroupedGemmFixedNK : DeviceGroupedGemmSplitK<ALayout,
+                                                          BLayout,
+                                                          DsLayout,
+                                                          ELayout,
+                                                          ADataType,
+                                                          BDataType,
+                                                          DsDataType,
+                                                          EDataType,
+                                                          AElementwiseOperation,
+                                                          BElementwiseOperation,
+                                                          CElementwiseOperation>
 {
-    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const = 0;
-    virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const               = 0;
-    virtual void SetKBatch(BaseArgument* p_arg, index_t k_batch) const                   = 0;
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
deleted file mode 100644
index d91eac073..000000000
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <array>
-#include <iostream>
-#include <vector>
-#include <sstream>
-
-#include "device_grouped_gemm.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-
-///
-/// @brief      Structure representing single GEMM problem arguments.
-///
-///             The pointer to the vector of those structures is passed to the GroupedGEMM entry
-///             point kernel.
-///
-/// @tparam     NumDTensor  The number of D input tensors.
-///
-template <index_t NumDTensor = 0>
-struct GroupedGemmMultipleDKernelArguments
-{
-    __host__ __device__
-    GroupedGemmMultipleDKernelArguments(const void* p_a_grid_,
-                                        const void* p_b_grid_,
-                                        std::array<const void*, NumDTensor> p_ds_grid_,
-                                        void* p_e_grid_,
-                                        index_t M_,
-                                        index_t N_,
-                                        index_t K_,
-                                        index_t StrideA_,
-                                        index_t StrideB_,
-                                        std::array<index_t, NumDTensor> StrideDs_,
-                                        index_t StrideE_)
-        : p_a_grid{p_a_grid_},
-          p_b_grid{p_b_grid_},
-          p_ds_grid{p_ds_grid_},
-          p_e_grid{p_e_grid_},
-          M{M_},
-          N{N_},
-          K{K_},
-          StrideA{StrideA_},
-          StrideB{StrideB_},
-          StrideDs{StrideDs_},
-          StrideE{StrideE_}
-    {
-    }
-
-    const void* p_a_grid;
-    const void* p_b_grid;
-    std::array<const void*, NumDTensor> p_ds_grid;
-    void* p_e_grid;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t StrideA;
-    index_t StrideB;
-    std::array<index_t, NumDTensor> StrideDs;
-    index_t StrideE;
-
-    void Print() const
-    {
-        std::stringstream str;
-        for(auto sd : StrideDs)
-            str << sd << ",";
-
-        std::cout << "arg {"
-                  << "M:" << M << ", "
-                  << "N:" << N << ", "
-                  << "K:" << K << ", "
-                  << "SA:" << StrideA << ", "
-                  << "SB:" << StrideB << ", "
-                  << "SE:" << StrideE << ", "
-                  << "SDs: {" << str.str() << "}"
-                  << "}" << std::endl;
-    }
-};
-
-template <typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename ELayout,
-          typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
-struct DeviceGroupedGemmMultipleDSplitK : public DeviceGroupedGemm<ALayout,
-                                                                   BLayout,
-                                                                   DsLayout,
-                                                                   ELayout,
-                                                                   ADataType,
-                                                                   BDataType,
-                                                                   DsDataType,
-                                                                   EDataType,
-                                                                   AElementwiseOperation,
-                                                                   BElementwiseOperation,
-                                                                   CDEElementwiseOperation>
-{
-    //----------------------------------------------------------------------------------------------
-    /// @brief      Sets the k batch size.
-    ///
-    /// @param      p_arg   Pointer to the Argument we're going to change.
-    /// @param[in]  kbatch  The kbatch value.
-    ///
-    virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0;
-
-    //----------------------------------------------------------------------------------------------
-    /// @brief      Sets the device kernel arguments pointer.
-    ///
-    /// @param      p_arg              The pointer to the Argument we're going to update.
-    /// @param[in]  p_dev_kernel_args  The pointer to the device memory which contains kernel
-    ///                                arguments.
-    ///
-    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0;
-
-    //----------------------------------------------------------------------------------------------
-    /// @brief      Gets the device kernel argument size.
-    ///
-    /// @param[in]  p_arg  The pointer to the Device op Argument.
-    ///
-    /// @return     The device kernel argument size.
-    ///
-    virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0;
-};
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
index 06d180d30..3ea650190 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include <iostream>
-#include <vector>
 
 #include "device_grouped_gemm.hpp"
 
@@ -31,7 +31,23 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
                                                           BElementwiseOperation,
                                                           CElementwiseOperation>
 {
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the k batch size.
+    ///
+    /// @param      p_arg   Pointer to the Argument we're going to change.
+    /// @param[in]  kbatch  The kbatch value.
+    ///
     virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0;
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the k batch size.
+    ///
+    /// @param      p_arg   Pointer to the Argument we're going to change.
+    /// @param[in]  kbatch  The kbatch value.
+    ///
+    virtual void SetKBatch(BaseArgument* p_arg, index_t kbatch) const
+    {
+        this->SetKBatchSize(p_arg, kbatch);
+    };
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
index c1030f31c..712fbfd9e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
@@ -3,83 +3,20 @@
 
 #pragma once
 
-#include <array>
-#include <iostream>
-#include <vector>
-#include <sstream>
-
 #include "device_grouped_gemm.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
+/// @brief Grouped GEMM kernel using output Tile Looping algorithm
 ///
-/// @brief      Structure representing single GEMM problem arguments.
-///
-///             The pointer to the vector of those structures is passed to the GroupedGEMM entry
-///             point kernel.
-///
-/// @tparam     NumDTensor  The number of D input tensors.
+/// @par This kernel does not require any knowledge about input data sizes (GEMM M/N/K)
+///       It requires only the number of groups to launch. Other information like
+///       data pointers and GEMM sizes, packed into gemm kernel args may be all dynamic
+///       (known only at kernel run-time).
 ///
-template <index_t NumDTensor = 0>
-struct GroupedGemmTileLoopKernelArguments
-{
-    __host__ __device__
-    GroupedGemmTileLoopKernelArguments(const void* p_a_grid_,
-                                       const void* p_b_grid_,
-                                       std::array<const void*, NumDTensor> p_ds_grid_,
-                                       void* p_e_grid_,
-                                       index_t M_,
-                                       index_t N_,
-                                       index_t K_,
-                                       index_t StrideA_,
-                                       index_t StrideB_,
-                                       std::array<index_t, NumDTensor> StrideDs_,
-                                       index_t StrideE_)
-        : p_a_grid{p_a_grid_},
-          p_b_grid{p_b_grid_},
-          p_ds_grid{p_ds_grid_},
-          p_e_grid{p_e_grid_},
-          M{M_},
-          N{N_},
-          K{K_},
-          StrideA{StrideA_},
-          StrideB{StrideB_},
-          StrideDs{StrideDs_},
-          StrideE{StrideE_}
-    {
-    }
-
-    const void* p_a_grid;
-    const void* p_b_grid;
-    std::array<const void*, NumDTensor> p_ds_grid;
-    void* p_e_grid;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t StrideA;
-    index_t StrideB;
-    std::array<index_t, NumDTensor> StrideDs;
-    index_t StrideE;
-
-    void Print() const
-    {
-        std::stringstream str;
-        for(auto sd : StrideDs)
-            str << sd << ",";
-
-        std::cout << "arg {"
-                  << "M:" << M << ", "
-                  << "N:" << N << ", "
-                  << "K:" << K << ", "
-                  << "SA:" << StrideA << ", "
-                  << "SB:" << StrideB << ", "
-                  << "SE:" << StrideE << ", "
-                  << "SDs: {" << str.str() << "}"
-                  << "}" << std::endl;
-    }
-};
+/// @note This kernel does not support SplitK.
 
 template <typename ALayout,
           typename BLayout,
@@ -104,23 +41,6 @@ struct DeviceGroupedGemmTileLoop : public DeviceGroupedGemm<ALayout,
                                                             BElementwiseOperation,
                                                             CDEElementwiseOperation>
 {
-    //----------------------------------------------------------------------------------------------
-    /// @brief      Sets the device kernel arguments pointer.
-    ///
-    /// @param      p_arg              The pointer to the Argument we're going to update.
-    /// @param[in]  p_dev_kernel_args  The pointer to the device memory which contains kernel
-    ///                                arguments.
-    ///
-    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const = 0;
-
-    //----------------------------------------------------------------------------------------------
-    /// @brief      Gets the device kernel argument size.
-    ///
-    /// @param[in]  p_arg  The pointer to the Device op Argument.
-    ///
-    /// @return     The device kernel argument size.
-    ///
-    virtual size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const = 0;
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
index 68c6dcc0f..0535c8032 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -18,7 +18,6 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
@@ -78,17 +77,17 @@ template <typename ALayout,
           // TODO: change gridwise_gemm_v2r4r2 to support AK1 & BK1
           enable_if_t<AK1 == BK1, bool> = false>
 struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
-    : public DeviceGroupedGemmMultipleDSplitK<ALayout,
-                                              BLayout,
-                                              DsLayout,
-                                              ELayout,
-                                              ADataType,
-                                              BDataType,
-                                              DsDataType,
-                                              EDataType,
-                                              AElementwiseOperation,
-                                              BElementwiseOperation,
-                                              CDEElementwiseOperation>
+    : public DeviceGroupedGemmSplitK<ALayout,
+                                     BLayout,
+                                     DsLayout,
+                                     ELayout,
+                                     ADataType,
+                                     BDataType,
+                                     DsDataType,
+                                     EDataType,
+                                     AElementwiseOperation,
+                                     BElementwiseOperation,
+                                     CDEElementwiseOperation>
 {
     using DeviceOp = DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage;
 
@@ -530,7 +529,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         index_t skipped_group_count_;
         index_t grid_size_;
         // Pointer to device memory with GEMM kernel arguments.
-        const void* p_dev_gemm_args_;
+        void* p_dev_gemm_kargs_;
 
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
@@ -566,7 +565,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         /// @return     The average kernel execution time (if time measurement is enabled.)
         ///
         float Run(const Argument& arg,
-                  const void* dev_gemm_args,
+                  void* dev_gemm_args,
                   void* dev_gemm_workspace,
                   const StreamConfig& stream_config = StreamConfig{})
         {
@@ -621,7 +620,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         ///
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(arg.p_dev_gemm_args_ == nullptr)
+            if(arg.p_dev_gemm_kargs_ == nullptr)
             {
                 std::ostringstream err;
                 err << "The gemm arguments device buffer is not allocated!"
@@ -637,7 +636,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                 throw std::runtime_error(err.str());
             }
 
-            return Run(arg, arg.p_dev_gemm_args_, arg.p_workspace_, stream_config);
+            return Run(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config);
         }
 
         float Run(const BaseArgument* p_arg,
@@ -723,7 +722,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
 
         template <bool HasMainKBlockLoop>
         float DispatchKernel(const Argument& arg,
-                             const void* dev_gemm_args,
+                             void* dev_gemm_kargs,
                              void* dev_gemm_workspace,
                              const StreamConfig& stream_config) const
         {
@@ -746,7 +745,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             return LaunchKernel(gemm_kernel,
                                 elementwise_kernel,
                                 arg,
-                                dev_gemm_args,
+                                dev_gemm_kargs,
                                 dev_gemm_workspace,
                                 stream_config);
         }
@@ -755,12 +754,19 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         float LaunchKernel(const KernelFunction& gemm_kernel,
                            const KernelFunction2& elementwise_kernel,
                            const Argument& arg,
-                           const void* dev_gemm_args,
+                           void* dev_gemm_kargs,
                            [[maybe_unused]] void* dev_gemm_workspace,
                            const StreamConfig& stream_config) const
         {
             float time{0.f};
 
+            hip_check_error(
+                hipMemcpyWithStream(dev_gemm_kargs,
+                                    arg.gemm_kernel_args_.data(),
+                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                    hipMemcpyHostToDevice,
+                                    stream_config.stream_id_));
+
             auto preprocess = [&]() {
                 hip_check_error(hipMemsetAsync(
                     dev_gemm_workspace, 0, arg.GetWorkspaceSizeBytes(), stream_config.stream_id_));
@@ -774,7 +780,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                 dim3(arg.grid_size_),
                 dim3(BlockSize),
                 0,
-                cast_pointer_to_constant_address_space(dev_gemm_args),
+                cast_pointer_to_constant_address_space(dev_gemm_kargs),
                 arg.gemm_kernel_args_.size(),
                 arg.a_element_op_,
                 arg.b_element_op_,
@@ -930,18 +936,30 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
         return str.str();
     }
 
-    void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
     {
-        arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
-                                  arg.gemm_kernel_args_.data(),
-                                  GetDeviceKernelArgSize(&arg),
-                                  hipMemcpyHostToDevice));
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->p_dev_gemm_kargs_ = p_dev_kernel_args;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
     }
 
-    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
     {
-        return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), p_dev_kernel_args);
+        auto arg = dynamic_cast<const Argument*>(p_arg);
+        if(arg)
+        {
+            return arg->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
     }
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
@@ -974,17 +992,22 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                 "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
     }
 
-    static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); }
-
-    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    [[deprecated]] static void SetKBatchSize(Argument& arg, index_t kbatch)
     {
-        return SetKBatchSize(*dynamic_cast<Argument*>(p_arg), kbatch);
+        arg.UpdateKBatch(kbatch);
     }
 
-    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
     {
-        return dynamic_cast<const Argument*>(p_arg)->gemm_kernel_args_.size() *
-               sizeof(GemmTransKernelArg);
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
index 2884e558c..f673713f3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -20,7 +20,6 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp" // stare wywalic
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 
 namespace ck {
@@ -522,7 +521,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         ComputeTypeA,
         ComputeTypeB>;
 
-    using KernelArguments = GroupedGemmTileLoopKernelArguments<NumDTensor>;
+    using KernelArguments = GroupedGemmKernelArgument<NumDTensor>;
     using Block2ETileMap  = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
     using OffsettedLocalBlock2ETileMap = OffsettedBlockToCTileMap2<Block2ETileMap>;
 
@@ -936,12 +935,31 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
         return str.str();
     }
 
+    void SetDeviceKernelArgs(Argument& arg,
+                             void* p_dev_kernel_args,
+                             const void* p_host_kernel_args) const
+    {
+        arg.p_dev_gemm_args_ = p_dev_kernel_args;
+        hip_check_error(hipMemcpy(p_dev_kernel_args,
+                                  p_host_kernel_args,
+                                  GetDeviceKernelArgSize(&arg),
+                                  hipMemcpyHostToDevice));
+    }
+
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg,
+                                     void* p_dev_kernel_args,
+                                     const void* p_host_kernel_args) const override
+    {
+        return SetDeviceKernelArgs(
+            *dynamic_cast<Argument*>(p_arg), p_dev_kernel_args, p_host_kernel_args);
+    }
+
     void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const
     {
         arg.p_dev_gemm_args_ = p_dev_kernel_args;
     }
 
-    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
     {
         return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), p_dev_kernel_args);
     }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 658f32351..86cf1da15 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -717,7 +717,24 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
     {
-        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmBiasTransKernelArg);
+        auto p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(p_arg_)
+        {
+            return p_arg_->group_count_ * sizeof(GemmBiasTransKernelArg);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemmMultipleDXdlCShuffle::Argument structure!");
+    }
+
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    {
+        return GetWorkSpaceSize(p_arg);
+    }
+
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    {
+        return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
index ac05a0703..1fee02bad 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
@@ -445,6 +445,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
     using Block2ETileMap = BlockToCTileMap_KBatch_M00_N0_M01Adapt_MLoops<MPerBlock, NPerBlock>;
     using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMapMLoops<Block2ETileMap>;
 
+    // TODO: replace with GroupedGemmKernelArgument
     struct GemmBiasTransKernelArg
     {
         // pointers
@@ -900,40 +901,58 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
         return str.str();
     }
 
-    static void SetDeviceKernelArgs(Argument& arg, const void* kernel_args)
-    {
-        arg.grouped_gemm_kernel_args_dev = kernel_args;
-    }
-
     // polymorphic
-    void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const override
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* kernel_args) const override
     {
-        return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), kernel_args);
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->grouped_gemm_kernel_args_dev = kernel_args;
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
     }
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
     {
-        auto arg = *dynamic_cast<const Argument*>(p_arg);
-
-        return arg.group_count_ * arg.barrier_size_grp_ * sizeof(uint32_t);
+        auto arg_ptr = dynamic_cast<const Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            return arg_ptr->group_count_ * arg_ptr->barrier_size_grp_ * sizeof(uint32_t);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
     }
 
     size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
     {
-        auto arg = *dynamic_cast<const Argument*>(p_arg);
-
-        return arg.group_count_ * sizeof(GroupedGemmKernelArgument<NumDTensor>);
+        auto arg_ptr = dynamic_cast<const Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            return arg_ptr->group_count_ * sizeof(GroupedGemmKernelArgument<NumDTensor>);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
     }
 
     void SetWorkSpacePointer(BaseArgument* p_arg,
                              void* p_workspace,
                              const StreamConfig& stream_config = StreamConfig{}) const override
     {
-        auto p_arg_          = dynamic_cast<Argument*>(p_arg);
-        p_arg_->p_workspace_ = p_workspace;
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
 
         hip_check_error(
-            hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(p_arg), stream_config.stream_id_));
+            hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(arg_ptr), stream_config.stream_id_));
     }
 
     static void SetKBatch(Argument& arg, index_t k_batch) { arg.UpdateKBatch(k_batch); }
@@ -941,7 +960,26 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
     // polymorphic
     void SetKBatch(BaseArgument* p_arg, index_t k_batch) const override
     {
-        return SetKBatch(*dynamic_cast<Argument*>(p_arg), k_batch);
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->UpdateKBatch(k_batch);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
+    }
+
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    {
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index cb0afbb08..626ffbe97 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -546,7 +546,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         bool supported = true;
         for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
         {
-            const auto& a        = arg.gemm_kernel_args_[i].karg_;
+            const auto& a = arg.gemm_kernel_args_[i].karg_;
+
             bool group_arg_valid = GridwiseGemm::CheckValidity(a);
             if(not group_arg_valid)
             {
@@ -636,16 +637,42 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
 
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
     {
-        return dynamic_cast<const Argument*>(p_arg)->gemm_kernel_args_.size() *
-               sizeof(GemmTransKernelArg);
+        auto p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(p_arg_)
+        {
+            return p_arg_->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!");
+    }
+
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    {
+        return GetWorkSpaceSize(p_arg);
     }
 
+    // TODO: deperecation notice.
     static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); }
 
     // polymorphic
     void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
     {
-        return SetKBatchSize(*dynamic_cast<Argument*>(p_arg), kbatch);
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!");
+    }
+
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    {
+        return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
     }
 };
 
diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp
index 0c4d85bed..a88109249 100644
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
@@ -5,7 +5,6 @@
 #pragma once
 
 #include "ck/utility/common_header.hpp"
-#include "ck/tensor_description/tensor_adaptor.hpp"
 
 namespace ck {
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
index 87426fd52..a999f9e3a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -95,6 +95,45 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);
 
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                   Col,
@@ -189,6 +228,124 @@ void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_in
                                                   PassThrough,
                                                   PassThrough,
                                                   PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 #endif
 
 #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
@@ -262,7 +419,11 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             {
                 add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
-                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances(
                     op_ptrs);
                 add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
                     op_ptrs);
@@ -334,12 +495,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             {
                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instances(
                     op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances(
+                    op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<ELayout, Row>)
             {
                 add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instances(
                     op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances(
+                    op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances(
+                    op_ptrs);
             }
         }
 #endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp
new file mode 100644
index 000000000..7721e42c3
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/utility/loop_scheduler.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Empty_Tuple = ck::Tuple<>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto PipelineV1         = ck::PipelineVersion::v1;
+static constexpr auto PipelineV2         = ck::PipelineVersion::v2;
+static constexpr auto DefaultScheduler   = ck::LoopScheduler::Default;
+static constexpr auto InterwaveScheduler = ck::LoopScheduler::Interwave;
+static constexpr auto GemmMNKPadding     = device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmDefault        = device::GemmSpecialization::Default;
+
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_rrr_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>
+    // clang-format on
+    >;
+
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_rcr_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8, Pipeline,  Scheduler>
+    // clang-format on
+    >;
+
+template <typename T,
+          device::GemmSpecialization GemmSpec = GemmMNKPadding,
+          PipelineVersion Pipeline            = PipelineV1,
+          LoopScheduler Scheduler             = DefaultScheduler,
+          enable_if_t<sizeof(T) == 2, bool>   = false>
+using device_grouped_gemm_xdl_splitk_2Bt_crr_instances = std::tuple<
+    // clang-format off
+        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| Pipeline | Loop      |
+        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Version  | Scheduler |
+        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |           |
+        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |           |
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 2, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>,
+        DeviceGroupedGemmXdlSplitKCShuffle<    Col,    Row, Empty_Tuple,    Row,     T,     T,     F32,        T, Empty_Tuple,     T, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,   Pipeline,  Scheduler>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
index de2032194..4a3e1a4ad 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -4,12 +4,30 @@ add_instance_library(device_grouped_gemm_instance
    device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
    device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
    device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
-   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+   
    device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
-   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
    device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+   
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp
+   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp
+   
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp
+   
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp
+
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp
+   device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp
+
    device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
    device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instance.cpp
+
    device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp
    device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_kn_mn_instance.cpp
    device_grouped_gemm_multiple_d_splitk_xdl_two_stage_bf16_bf16_bf16_mk_nk_mn_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp
new file mode 100644
index 000000000..b8a03871c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_crr_instances<BF16, GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp
new file mode 100644
index 000000000..10141165c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter.cpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_crr_instances<BF16,
+                                                         GemmMNKPadding,
+                                                         PipelineV1,
+                                                         InterwaveScheduler>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp
new file mode 100644
index 000000000..b96f5983c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_km_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Col,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_crr_instances<BF16, GemmMNKPadding, PipelineV2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp
new file mode 100644
index 000000000..8fad42316
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<BF16, GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp
new file mode 100644
index 000000000..7845136ca
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter.cpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<BF16,
+                                                         GemmMNKPadding,
+                                                         PipelineV1,
+                                                         InterwaveScheduler>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp
new file mode 100644
index 000000000..a2d79edf6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<BF16, GemmMNKPadding, PipelineV2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp
new file mode 100644
index 000000000..033a2929f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<BF16, GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp
new file mode 100644
index 000000000..cf8c94bf4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter.cpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<BF16,
+                                                         GemmMNKPadding,
+                                                         PipelineV1,
+                                                         InterwaveScheduler>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp
new file mode 100644
index 000000000..70c0d703e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_bf16_bf16_bf16_mk_nk_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  BF16,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<BF16, GemmMNKPadding, PipelineV2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
index 98e476f8b..077a8a18c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -1,53 +1,14 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-
-// a[m, k] * b[k, n] = e[m, n]
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>
-    // clang-format on
-    >;
-
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                   Row,
@@ -61,8 +22,8 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{});
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<F16, GemmDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
deleted file mode 100644
index ed0a8c7b7..000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1>,
-
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v1,  LoopScheduler::Interwave>,
-
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8, PipelineVersion::v2>
-    // clang-format on
-    >;
-
-void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-                                                  Row,
-                                                  Empty_Tuple,
-                                                  Row,
-                                                  F16,
-                                                  F16,
-                                                  Empty_Tuple,
-                                                  F16,
-                                                  PassThrough,
-                                                  PassThrough,
-                                                  PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_tile_instances{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp
new file mode 100644
index 000000000..8ad4736ac
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<F16, GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp
new file mode 100644
index 000000000..1d968c821
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter.cpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv1_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<F16,
+                                                         GemmMNKPadding,
+                                                         PipelineV1,
+                                                         InterwaveScheduler>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp
new file mode 100644
index 000000000..ee3d7d73b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_pv2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_gemm_xdl_splitk_2Bt_rrr_instances<F16, GemmMNKPadding, PipelineV2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
index aa6365cd9..085e74f0c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -1,57 +1,14 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough                 = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-
-// a[m, k] * b[n, k] = e[m, n]
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-    // clang-format on
-    >;
-
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                   Col,
@@ -65,8 +22,8 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{});
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<F16, GemmDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
index f4460b360..320bb933b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
@@ -1,63 +1,14 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,    
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   256,    32,   8,   8,   32,   32,    1,    4,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>
-    // clang-format on
-    >;
-
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                   Col,
@@ -72,7 +23,7 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
                                                   PassThrough>>>& instances)
 {
     add_device_operation_instances(
-        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<F16, GemmMNKPadding>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp
deleted file mode 100644
index c98328e52..000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using BF16 = ck::bhalf_t;
-using I8   = int8_t;
-using F32  = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
-using Multiply            = ck::tensor_operation::element_wise::Multiply;
-using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
-using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
-using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
-
-static constexpr auto GemmDefault    = GemmSpecialization::Default;
-static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
-static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
-static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
-
-static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
-
-template <typename DsLayout,
-          typename DsDataType,
-          typename CDEElementwiseOp,
-          GemmSpecialization GemmSpec = GemmMNKPadding>
-using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
-        //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   224,   256,    64,   8,   4,   16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,         0,           1,           2,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   256,    32,   8,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
-
-    // clang-format on
-    >;
-
-template <typename DsLayout,
-          typename DsDataType,
-          typename CDEElementwiseOp,
-          GemmSpecialization GemmSpec                 = GemmMNKPadding,
-          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave>
-using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances =
-    std::tuple<
-        // clang-format off
-        //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-        // Latency friendly
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // Memory friendly
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    64,   128,   8,   4,   16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    32,    64,   128,   8,   4,   32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,   128,    64,   8,   4,   16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    32,   128,    64,   8,   4,   32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,    16,   256,    64,   8,   4,   16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,              S<1, 16, 1, 16>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,    32,   256,    64,   8,   4,   32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,              S<1, 16, 1, 16>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-        // clang-format on
-        >;
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          Multiply>>>& instances)
-{
-    // comp
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmMNKPadding>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmMNPadding>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmKPadding>{});
-    // mem
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmDefault,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNKPadding,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNPadding,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmKPadding,
-                                                                              Intrawave>{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmDefault,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNKPadding,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNPadding,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmKPadding,
-                                                                              Interwave>{});
-}
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row, Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16, BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyAddFastGelu>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances<
-            ck::Tuple<Row, Row>,
-            ck::Tuple<BF16, BF16>,
-            MultiplyAddFastGelu>{});
-}
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyFastGelu>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances<
-            ck::Tuple<Row>,
-            ck::Tuple<BF16>,
-            MultiplyFastGelu>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 0b73e4fcd..c10cd0ea9 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -17,7 +17,6 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
@@ -42,11 +41,14 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& StrideAs,
                                const std::vector<int>& StrideBs,
                                const std::vector<int>& StrideCs,
-                               int kbatch   = 1,
-                               int n_warmup = 1,
-                               int n_iter   = 10)
+                               const std::vector<int>& kbatches = {},
+                               int n_warmup                     = 1,
+                               int n_iter                       = 10)
 {
     bool pass = true;
+    // TODO: Fixme - we do not pass compute data type here but need it
+    // to compute error thresholds.
+    using ComputeDataType = ADataType;
 
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -75,6 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification,
     std::vector<Tensor<CDataType>> c_m_n_host_results;
     std::vector<Tensor<CDataType>> c_m_n_device_results;
 
+    ComputeDataType max_abs_in_val = 0.f;
     for(std::size_t i = 0; i < group_count; i++)
     {
         a_m_k.push_back(
@@ -93,17 +96,18 @@ bool profile_grouped_gemm_impl(int do_verification,
                       << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                       << "]:" << c_m_n_device_results[i].mDesc << std::endl;
         }
-        std::size_t num_thread = 1;
         switch(init_method)
         {
         case 0: break;
         case 1:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
+            ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k[i]);
+            ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n[i]);
+            max_abs_in_val = 2.f;
             break;
         default:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
+            ck::utils::FillUniformDistribution<ADataType>{-0.5f, 0.5f}(a_m_k[i]);
+            ck::utils::FillUniformDistribution<BDataType>{-0.5f, 0.5f}(b_k_n[i]);
+            max_abs_in_val = 0.5f;
         }
     }
 
@@ -164,7 +168,20 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                                      BElementOp,
                                                                      CElementOp>;
 
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+    // If kbatch would be bigger than 1, then we will use SplitK version.
+    using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
+                                                                                 BLayout,
+                                                                                 ck::Tuple<>,
+                                                                                 CLayout,
+                                                                                 ADataType,
+                                                                                 BDataType,
+                                                                                 ck::Tuple<>,
+                                                                                 CDataType,
+                                                                                 AElementOp,
+                                                                                 BElementOp,
+                                                                                 CElementOp>;
+
+    auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
 
     if(op_ptrs.size() <= 0)
@@ -205,7 +222,6 @@ bool profile_grouped_gemm_impl(int do_verification,
             ref_invoker.Run(ref_argument);
         }
     }
-
     // profile device GEMM instances
     for(auto& gemm_ptr : op_ptrs)
     {
@@ -221,43 +237,44 @@ bool profile_grouped_gemm_impl(int do_verification,
 
         auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
 
-        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        std::size_t workspace_size = gemm_ptr->GetWorkSpaceSize(argument_ptr.get());
+        std::size_t kargs_size     = gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get());
 
-        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
-        std::string gemm_name = gemm_ptr->GetTypeString();
+        DeviceMem gemm_workspace, gemm_kargs;
 
-        using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
-                                                                                     BLayout,
-                                                                                     ck::Tuple<>,
-                                                                                     CLayout,
-                                                                                     ADataType,
-                                                                                     BDataType,
-                                                                                     ck::Tuple<>,
-                                                                                     CDataType,
-                                                                                     AElementOp,
-                                                                                     BElementOp,
-                                                                                     CElementOp>;
-
-        // skip non-splitk grouped_gemm
-        if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
+        // The following is necessary since TwoStage kernel is using additional memory both
+        // for Workspace and kernel arguments.
+        if(kargs_size > 0)
         {
-            continue;
+            gemm_kargs.Realloc(kargs_size);
+            gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(), gemm_kargs.GetDeviceBuffer());
+        }
+        if(workspace_size > 0 && workspace_size != kargs_size)
+        {
+            gemm_workspace.Realloc(workspace_size);
+            gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_workspace.GetDeviceBuffer());
         }
 
+        std::string gemm_name = gemm_ptr->GetTypeString();
+
         std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};
 
-        if(kbatch > 0)
+        // If the user will provide not empty kbatches list, then we test predefined set of kbatch
+        // values.
+        if(!kbatches.empty())
         {
-            kbatch_list = {kbatch};
+            kbatch_list = kbatches;
         }
 
         for(std::size_t j = 0; j < kbatch_list.size(); j++)
         {
-
             auto kbatch_curr = kbatch_list[j];
 
-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
+            if(kbatch_curr > 1 && dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
+            {
+                dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
+                    ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
+            }
 
             if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
             {
@@ -272,23 +289,18 @@ bool profile_grouped_gemm_impl(int do_verification,
                     bool instance_pass = true;
                     for(std::size_t i = 0; i < gemm_descs.size(); i++)
                     {
-
                         c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
-
-                        if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i],
-                                                                      "Error: Incorrect results!",
-                                                                      0.06);
-                        }
-                        else
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i]);
-                        }
+                        auto atol = ck::utils::get_absolute_threshold<ComputeDataType, CDataType>(
+                            max_abs_in_val, gemm_descs[i].K_);
+                        auto rtol = ck::utils::get_relative_threshold<ComputeDataType, CDataType>(
+                            gemm_descs[i].K_);
+
+                        instance_pass =
+                            instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                  c_m_n_host_results[i],
+                                                                  "Error: Incorrect results!",
+                                                                  rtol,
+                                                                  atol);
 
                         if(do_log)
                         {
@@ -311,11 +323,12 @@ bool profile_grouped_gemm_impl(int do_verification,
                     pass = pass && instance_pass;
                 }
 
-                float ave_time = invoker_ptr->Run(
-                    argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
-
                 if(time_kernel)
                 {
+                    float ave_time =
+                        invoker_ptr->Run(argument_ptr.get(),
+                                         StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
+
                     std::size_t flop = 0, num_btype = 0;
                     for(std::size_t i = 0; i < gemm_descs.size(); i++)
                     {
diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
index f66564416..94ee2a37e 100644
--- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
@@ -143,8 +143,7 @@ bool profile_grouped_gemm_multiply_tile_loop_impl(int do_verification,
     p_ds.reserve(group_count);
     p_e.reserve(group_count);
 
-    using KernelArguments =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+    using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;
 
     std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
     std::vector<KernelArguments> gemm_kargs;
diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
index 74faf15be..3a4ca24dd 100644
--- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
@@ -127,7 +127,7 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification,
     p_b.reserve(group_count);
     p_c.reserve(group_count);
 
-    using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<>;
+    using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<>;
 
     std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
     std::vector<KernelArguments> gemm_kargs;
diff --git a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
deleted file mode 100644
index 14df96d50..000000000
--- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
+++ /dev/null
@@ -1,367 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iomanip>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/fill.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-
-namespace ck {
-namespace profiler {
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AccDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-bool profile_grouped_gemm_two_stage_impl(int do_verification,
-                                         int init_method,
-                                         bool do_log,
-                                         bool time_kernel,
-                                         const std::vector<int>& Ms,
-                                         const std::vector<int>& Ns,
-                                         const std::vector<int>& Ks,
-                                         const std::vector<int>& StrideAs,
-                                         const std::vector<int>& StrideBs,
-                                         const std::vector<int>& StrideCs,
-                                         int kbatch   = 1,
-                                         int n_warmup = 1,
-                                         int n_iter   = 10)
-{
-    bool pass = true;
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
-
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-    std::size_t group_count = Ms.size();
-
-    if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
-         group_count == StrideBs.size() && group_count == StrideCs.size()))
-    {
-        throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
-    }
-
-    std::vector<Tensor<ADataType>> a_m_k;
-    std::vector<Tensor<BDataType>> b_k_n;
-    std::vector<Tensor<CDataType>> c_m_n_host_results;
-    std::vector<Tensor<CDataType>> c_m_n_device_results;
-
-    for(std::size_t i = 0; i < group_count; i++)
-    {
-        a_m_k.push_back(
-            Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
-        b_k_n.push_back(
-            Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
-
-        c_m_n_device_results.push_back(
-            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-
-        c_m_n_host_results.push_back(
-            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
-        {
-            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
-        }
-        std::size_t num_thread = 1;
-        switch(init_method)
-        {
-        case 0: break;
-        case 1:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
-            break;
-        default:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
-        }
-    }
-
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto a_element_op = AElementOp{};
-    const auto b_element_op = BElementOp{};
-    const auto c_element_op = CElementOp{};
-
-    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
-    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;
-
-    a_device_buf.reserve(group_count);
-    b_device_buf.reserve(group_count);
-    c_device_buf.reserve(group_count);
-
-    std::vector<const void*> p_a, p_b;
-    std::vector<void*> p_c;
-
-    p_a.reserve(group_count);
-    p_b.reserve(group_count);
-    p_c.reserve(group_count);
-
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
-
-    gemm_descs.reserve(group_count);
-
-    for(std::size_t i = 0; i < group_count; i++)
-    {
-        a_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
-        b_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
-        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
-
-        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
-        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
-
-        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
-
-        p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
-        p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
-        p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
-    }
-
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
-                                                                     BLayout,
-                                                                     ck::Tuple<>,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     ck::Tuple<>,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
-
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-
-    if(op_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device GEMM instance found");
-    }
-
-    std::string best_gemm_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-    float best_kbatch     = 0;
-
-    auto p_ds = std::vector<std::array<const void*, 0>>{};
-
-    if(do_verification)
-    {
-        for(std::size_t i = 0; i < gemm_descs.size(); i++)
-        {
-            using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                                    BDataType,
-                                                                                    CDataType,
-                                                                                    AccDataType,
-                                                                                    AElementOp,
-                                                                                    BElementOp,
-                                                                                    CElementOp>;
-
-            auto ref_gemm    = ReferenceGemmInstance{};
-            auto ref_invoker = ref_gemm.MakeInvoker();
-
-            auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
-                                                      b_k_n[i],
-                                                      c_m_n_host_results[i],
-                                                      a_element_op,
-                                                      b_element_op,
-                                                      c_element_op);
-
-            ref_invoker.Run(ref_argument);
-        }
-    }
-
-    // profile device GEMM instances
-    for(auto& gemm_ptr : op_ptrs)
-    {
-        auto argument_ptr =
-            gemm_ptr->MakeArgumentPointer(p_a,
-                                          p_b,
-                                          p_ds,
-                                          p_c,
-                                          gemm_descs,
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{});
-
-        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
-
-        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
-        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
-
-        std::string gemm_name = gemm_ptr->GetTypeString();
-
-        using DeviceOpSplitK =
-            ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitK<ALayout,
-                                                                           BLayout,
-                                                                           ck::Tuple<>,
-                                                                           CLayout,
-                                                                           ADataType,
-                                                                           BDataType,
-                                                                           ck::Tuple<>,
-                                                                           CDataType,
-                                                                           AElementOp,
-                                                                           BElementOp,
-                                                                           CElementOp>;
-
-        // skip non-splitk grouped_gemm
-        if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
-        {
-            continue;
-        }
-
-        std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};
-
-        if(kbatch > 0)
-        {
-            kbatch_list = {kbatch};
-        }
-
-        for(std::size_t j = 0; j < kbatch_list.size(); j++)
-        {
-
-            auto kbatch_curr = kbatch_list[j];
-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
-
-            DeviceMem gemm_arg_dev_mem(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                                           ->GetDeviceKernelArgSize(argument_ptr.get()));
-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetDeviceKernelArgs(argument_ptr.get(), gemm_arg_dev_mem.GetDeviceBuffer());
-
-            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
-            {
-                gemm_desc_workspace.SetZero();
-                for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    c_device_buf[i]->SetZero();
-
-                invoker_ptr->Run(argument_ptr.get(),
-                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
-                if(do_verification)
-                {
-                    bool instance_pass = true;
-                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    {
-                        c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
-                        if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i],
-                                                                      "Error: Incorrect results!",
-                                                                      0.06);
-                        }
-                        else
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i]);
-                        }
-
-                        if(do_log)
-                        {
-                            LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_host  : ", c_m_n_host_results[i].mData, ",")
-                                << std::endl;
-                        }
-                    }
-
-                    std::cout << "Instance: " << gemm_name << " verification "
-                              << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
-
-                    pass = pass && instance_pass;
-                }
-                float ave_time = invoker_ptr->Run(
-                    argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
-                if(time_kernel)
-                {
-                    std::size_t flop = 0, num_btype = 0;
-                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    {
-                        flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
-
-                        num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
-                                     sizeof(BDataType) * Ks[i] * Ns[i] +
-                                     sizeof(CDataType) * Ms[i] * Ns[i];
-                    }
-
-                    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-                    float gb_per_sec = num_btype / 1.E6 / ave_time;
-                    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
-                              << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch "
-                              << kbatch_curr << std::endl;
-
-                    if(tflops > best_tflops)
-                    {
-                        best_gemm_name  = gemm_name;
-                        best_tflops     = tflops;
-                        best_ave_time   = ave_time;
-                        best_gb_per_sec = gb_per_sec;
-                        best_kbatch     = kbatch_curr;
-                    }
-                }
-            }
-            else
-            {
-                std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
-                          << std::endl;
-            }
-        }
-    }
-
-    if(time_kernel)
-    {
-        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-                  << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
-                  << std::endl;
-    }
-
-    return pass;
-}
-
-} // namespace profiler
-} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index f079d554b..35e91f817 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -43,7 +43,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
     list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
-    list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index fbf44d720..2adcd6483 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -39,16 +39,13 @@ namespace {
 std::vector<int> argToIntArray(char* input)
 {
     std::vector<int> out;
-
     std::istringstream in(input);
-
     std::string item;
 
     while(std::getline(in, item, ','))
     {
         out.push_back(std::stoi(item));
     }
-
     return out;
 }
 
@@ -69,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
             << "arg7: time kernel (0=n0, 1=yes)\n"
             << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
                "64,64 64,64 128,128)\n"
-            << "arg15: kbatch value (default 1)\n"
+            << "arg15: kbatch values (default 1)\n"
             << "optional:\n"
             << "arg16: number of warm-up cycles (default 1)\n"
             << "arg17: number of iterations (default 10)\n"
@@ -92,7 +89,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     const auto StrideAs = argToIntArray(argv[11]);
     const auto StrideBs = argToIntArray(argv[12]);
     const auto StrideCs = argToIntArray(argv[13]);
-    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
+    const auto kbatches = argc >= 15 ? argToIntArray(argv[14]) : std::vector<int>{};
 
     int n_warmup = 1;
     int n_iter   = 10;
@@ -102,7 +99,6 @@ int profile_grouped_gemm(int argc, char* argv[])
         n_iter   = std::stoi(argv[16]);
     }
 
-#ifdef CK_ENABLE_FP16
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         ck::profiler::profile_grouped_gemm_impl<ck::half_t,
@@ -121,7 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -143,7 +139,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -165,7 +161,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -187,7 +183,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -209,7 +205,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -231,7 +227,73 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                    StrideAs,
                                                                                    StrideBs,
                                                                                    StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
                                                                                    n_warmup,
                                                                                    n_iter);
     }
@@ -239,7 +301,6 @@ int profile_grouped_gemm(int argc, char* argv[])
     {
         throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
     }
-#endif
     return 0;
 }
 
diff --git a/profiler/src/profile_grouped_gemm_fixed_nk.cpp b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
index de90a33ef..e33d79850 100644
--- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp
+++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
@@ -32,9 +32,7 @@ namespace {
 std::vector<int> argToIntArray(char* input)
 {
     std::vector<int> out;
-
     std::istringstream in(input);
-
     std::string item;
 
     while(std::getline(in, item, ','))
@@ -83,7 +81,7 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     const auto StrideAs = argToIntArray(argv[11]);
     const auto StrideBs = argToIntArray(argv[12]);
     const auto StrideCs = argToIntArray(argv[13]);
-    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
+    const int kbatch    = argc >= 15 ? std::stoi(argv[14]) : 1;
 
     using F32 = float;
     using F16 = ck::half_t;
@@ -97,8 +95,8 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     int n_iter   = 10;
     if(argc == 17)
     {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
+        n_warmup = std::stoi(argv[15]);
+        n_iter   = std::stoi(argv[16]);
     }
 
 #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
diff --git a/profiler/src/profile_grouped_gemm_two_stage.cpp b/profiler/src/profile_grouped_gemm_two_stage.cpp
deleted file mode 100644
index db37a0b76..000000000
--- a/profiler/src/profile_grouped_gemm_two_stage.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "profiler/profile_grouped_gemm_two_stage_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-enum struct GemmMatrixLayout
-{
-    MK_KN_MN, // 0
-    MK_NK_MN, // 1
-};
-
-enum struct GemmDataType
-{
-    F16_F16_F16,    // 0
-    BF16_INT8_BF16, // 1
-    BF16_BF16_BF16  // 2
-};
-
-#define OP_NAME "grouped_gemm_two_stage"
-#define OP_DESC "Grouped GEMM TwoStage"
-
-namespace {
-
-std::vector<int> argToIntArray(char* input)
-{
-    std::vector<int> out;
-
-    std::istringstream in(input);
-
-    std::string item;
-
-    while(std::getline(in, item, ','))
-    {
-        out.push_back(std::stoi(item));
-    }
-
-    return out;
-}
-
-int profile_grouped_gemm_two_stage(int argc, char* argv[])
-{
-    if(argc < 14)
-    {
-        std::cout
-            << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-            << "arg2: data type (0: fp16; 1: bf16@int8; 2: bf16)\n"
-            << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]);\n"
-            << "arg4: verification (0: no; 1: yes)\n"
-            << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
-            << "arg6: print tensor value (0: no; 1: yes)\n"
-            << "arg7: time kernel (0=n0, 1=yes)\n"
-            << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
-               "64,64 64,64 128,128)\n"
-            << "arg15: kbatch value (default 1)\n"
-            << "optional:\n"
-            << "arg16: number of warm-up cycles (default 1)\n"
-            << "arg17: number of iterations (default 10)\n"
-            << std::endl;
-
-        exit(1);
-    }
-
-    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
-    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
-    const bool do_verification = std::stoi(argv[4]);
-    const int init_method      = std::stoi(argv[5]);
-    const bool do_log          = std::stoi(argv[6]);
-    const bool time_kernel     = std::stoi(argv[7]);
-
-    const auto Ms = argToIntArray(argv[8]);
-    const auto Ns = argToIntArray(argv[9]);
-    const auto Ks = argToIntArray(argv[10]);
-
-    auto StrideAs    = argToIntArray(argv[11]);
-    auto StrideBs    = argToIntArray(argv[12]);
-    auto StrideCs    = argToIntArray(argv[13]);
-    const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
-
-    const int DefaultStrideA = Ks[0];
-    const int DefaultStrideB = Ns[0];
-    const int DefaultStrideC = Ns[0];
-
-    for(size_t i = 0; i < Ms.size(); ++i)
-    {
-        StrideAs[i] = StrideAs[i] == -1 ? DefaultStrideA : StrideAs[i];
-        StrideBs[i] = StrideBs[i] == -1 ? DefaultStrideB : StrideBs[i];
-        StrideCs[i] = StrideCs[i] == -1 ? DefaultStrideC : StrideCs[i];
-    }
-
-    int n_warmup = 1;
-    int n_iter   = 10;
-    if(argc == 17)
-    {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
-    }
-
-    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::half_t,
-                                                          ck::half_t,
-                                                          ck::half_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          int8_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          int8_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::ColumnMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::ColumnMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else
-    {
-        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
-    }
-    return 0;
-}
-
-} // anonymous namespace
-
-REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_two_stage);
diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt
index 55cb20977..f47685cf9 100644
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
@@ -6,12 +6,6 @@ if(result EQUAL 0)
     add_dependencies(test_grouped_gemm test_grouped_gemm_splitk)
 endif()
 
-add_gtest_executable(test_grouped_gemm_two_stage_splitk test_grouped_gemm_two_stage_multiple_d_splitk_xdl.cpp)
-if(result EQUAL 0)
-    target_link_libraries(test_grouped_gemm_two_stage_splitk PRIVATE utility device_grouped_gemm_instance)
-    add_dependencies(test_grouped_gemm test_grouped_gemm_two_stage_splitk)
-endif()
-
 add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp)
 if(result EQUAL 0)
     target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
diff --git a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
index d9282fa92..74d49eb57 100644
--- a/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
+++ b/test/grouped_gemm/test_grouped_gemm_splitk_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <tuple>
 #include <vector>
@@ -10,25 +10,35 @@
 #include "gtest/gtest.h"
 #include "test_grouped_gemm_util.hpp"
 
-using F16 = ck::half_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F8   = ck::f8_t;
+using I8   = int8_t;
+
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
-using RRR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
-using RCR_F16_F16_F16 = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
-
-using RRR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Row, Row, F16, F16, F16>>;
-using RCR_F16_F16_F16_LargeK = ck::test::TestGroupedGemm<std::tuple<Row, Col, Row, F16, F16, F16>>;
-
-const std::vector<int> KBATCH{1, 2, 3, 5, 8};
-
-INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_KN, RRR_F16_F16_F16, testing::ValuesIn(KBATCH));
-INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_MK_NK, RCR_F16_F16_F16, testing::ValuesIn(KBATCH));
-INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_KN,
-                         RRR_F16_F16_F16_LargeK,
-                         testing::Values(32, 64));
-INSTANTIATE_TEST_SUITE_P(TestGroupedGemm_splitk_LargeK_MK_NK,
-                         RCR_F16_F16_F16_LargeK,
-                         testing::Values(32, 64));
+template <typename Tuple>
+class TestGroupedGemm : public ck::test::TestGroupedGemm<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    std::tuple<     Row, Row, Row, F16, F16, F16>,
+    std::tuple<     Row, Col, Row, F16, F16, F16>,
+    std::tuple<     Col, Row, Row, F16, F16, F16>,
+    std::tuple<     Col, Col, Row, F16, F16, F16>,
+    std::tuple<     Row, Row, Row, BF16, BF16, BF16>,
+    std::tuple<     Row, Col, Row, BF16, BF16, BF16>,
+    std::tuple<     Col, Row, Row, BF16, BF16, BF16>,
+    std::tuple<     Row, Row, Row, BF16, I8, BF16>,
+    std::tuple<     Row, Col, Row, BF16, I8, BF16>,
+    std::tuple<     Row, Row, Row, F16, F8, F16>,
+    std::tuple<     Row, Row, Row, F8, F16, F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes);
 
 #include "test_grouped_gemm_ut_cases.inc"
diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
index d94d140d9..f4011cf99 100644
--- a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
+++ b/test/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -1,6 +1,6 @@
 #pragma once
 
-TEST_P(RRR_F16_F16_F16, TinyCases)
+TYPED_TEST(TestGroupedGemm, TinyCases)
 {
     const std::vector<int> Ms{0, 1};
     constexpr int N = 768;
@@ -8,14 +8,11 @@ TEST_P(RRR_F16_F16_F16, TinyCases)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
 
-TEST_P(RRR_F16_F16_F16, SmallCases)
+TYPED_TEST(TestGroupedGemm, SmallCases)
 {
     const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
     constexpr int N = 768;
@@ -23,14 +20,11 @@ TEST_P(RRR_F16_F16_F16, SmallCases)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
 
-TEST_P(RRR_F16_F16_F16, MidCases)
+TYPED_TEST(TestGroupedGemm, MidCases)
 {
     const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
     constexpr int N = 768;
@@ -38,14 +32,11 @@ TEST_P(RRR_F16_F16_F16, MidCases)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
 
-TEST_P(RRR_F16_F16_F16, Regular)
+TYPED_TEST(TestGroupedGemm, Regular)
 {
     const std::vector<int> Ms{64, 128, 256};
     constexpr int N = 768;
@@ -53,14 +44,11 @@ TEST_P(RRR_F16_F16_F16, Regular)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
 
-TEST_P(RRR_F16_F16_F16, MNKPadded)
+TYPED_TEST(TestGroupedGemm, MNKPadded)
 {
     const std::vector<int> Ms{127, 150, 188, 210};
     constexpr int N = 136;
@@ -68,88 +56,11 @@ TEST_P(RRR_F16_F16_F16, MNKPadded)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
 
-TEST_P(RCR_F16_F16_F16, TinyCases)
-{
-    const std::vector<int> Ms{0, 1};
-    constexpr int N = 768;
-    constexpr int K = 544;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
-
-TEST_P(RCR_F16_F16_F16, SmallCases)
-{
-    const std::vector<int> Ms{2, 1, 3, 4, 5, 0};
-    constexpr int N = 768;
-    constexpr int K = 544;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
-
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
-
-TEST_P(RCR_F16_F16_F16, MidCases)
-{
-    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
-    constexpr int N = 768;
-    constexpr int K = 544;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
-
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
-
-TEST_P(RCR_F16_F16_F16, Regular)
-{
-    const std::vector<int> Ms{32, 64, 128, 256};
-    constexpr int N = 768;
-    constexpr int K = 320;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
-
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
-
-TEST_P(RCR_F16_F16_F16, MNKPadded)
-{
-    const std::vector<int> Ms{127, 150, 188, 210};
-    constexpr int N = 136;
-    constexpr int K = 280;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
-
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
-
-TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch)
+TYPED_TEST(TestGroupedGemm, TestLargeKBatch)
 {
     const std::vector<int> Ms{188, 210};
     constexpr int N = 768;
@@ -157,24 +68,8 @@ TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch)
 
     const std::vector<int> Ns(Ms.size(), N);
     const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), N);
-    const std::vector<int> StrideCs(Ms.size(), N);
-
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
-}
 
-TEST_P(RCR_F16_F16_F16_LargeK, TestLargeKBatch)
-{
-    const std::vector<int> Ms{188, 210};
-    constexpr int N = 768;
-    constexpr int K = 4096;
-
-    const std::vector<int> Ns(Ms.size(), N);
-    const std::vector<int> Ks(Ms.size(), K);
-    const std::vector<int> StrideAs(Ms.size(), K);
-    const std::vector<int> StrideBs(Ms.size(), K);
-    const std::vector<int> StrideCs(Ms.size(), N);
+    this->k_batches_ = {32, 64};
 
-    this->Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, this->GetParam());
+    this->Run(Ms, Ns, Ks);
 }
diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/test/grouped_gemm/test_grouped_gemm_util.hpp
index 9e1395b9f..a3ab0e087 100644
--- a/test/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/grouped_gemm/test_grouped_gemm_util.hpp
@@ -22,7 +22,6 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/number.hpp"
 #include "profiler/profile_grouped_gemm_impl.hpp"
-#include "profiler/profile_grouped_gemm_two_stage_impl.hpp"
 
 namespace ck {
 namespace test {
@@ -40,7 +39,7 @@ std::string serialize_range(const Range& range)
 }
 
 template <typename Tuple>
-class TestGroupedGemm : public testing::TestWithParam<int>
+class TestGroupedGemm : public testing::Test
 {
     protected:
     using ALayout   = std::tuple_element_t<0, Tuple>;
@@ -50,23 +49,77 @@ class TestGroupedGemm : public testing::TestWithParam<int>
     using BDataType = std::tuple_element_t<4, Tuple>;
     using EDataType = std::tuple_element_t<5, Tuple>;
 
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
     public:
     static constexpr bool verify_     = true;
-    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr int init_method_ = 1; // integer value initialization
     static constexpr bool log_        = false;
     static constexpr bool bench_      = false; // measure kernel performance
+    static constexpr int n_warmup_    = 0;
+    static constexpr int n_iter_      = 1;
+    std::vector<int> k_batches_;
 
-    void SetUp() override {}
+    void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; }
 
+    private:
+    template <typename Layout>
+    void SetStrides(std::vector<int>& strides,
+                    const std::vector<int>& rows,
+                    const std::vector<int>& cols) const
+    {
+        if(std::is_same_v<Layout, Row>)
+        {
+            for(const auto c : cols)
+            {
+                strides.emplace_back(c);
+            }
+        }
+        else if(std::is_same_v<Layout, Col>)
+        {
+            for(const auto r : rows)
+            {
+                strides.emplace_back(r);
+            }
+        }
+    }
+
+    public:
     void Run(const std::vector<int>& Ms,
              const std::vector<int>& Ns,
              const std::vector<int>& Ks,
-             const std::vector<int>& StrideAs,
-             const std::vector<int>& StrideBs,
-             const std::vector<int>& StrideCs,
-             int kbatch   = 1,
-             int n_warmup = 1,
-             int n_iter   = 10)
+             const std::vector<int>& StrideAs = {},
+             const std::vector<int>& StrideBs = {},
+             const std::vector<int>& StrideCs = {})
+    {
+        std::vector<int> stride_as = StrideAs;
+        std::vector<int> stride_bs = StrideBs;
+        std::vector<int> stride_cs = StrideCs;
+
+        if(stride_as.empty())
+        {
+            SetStrides<ALayout>(stride_as, Ms, Ks);
+        }
+        if(stride_bs.empty())
+        {
+            SetStrides<BLayout>(stride_bs, Ks, Ns);
+        }
+        if(stride_cs.empty())
+        {
+            SetStrides<ELayout>(stride_cs, Ms, Ns);
+        }
+
+        RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches_);
+    }
+
+    void RunSingle(const std::vector<int>& Ms,
+                   const std::vector<int>& Ns,
+                   const std::vector<int>& Ks,
+                   const std::vector<int>& StrideAs,
+                   const std::vector<int>& StrideBs,
+                   const std::vector<int>& StrideCs,
+                   const std::vector<int>& kbatches)
     {
         bool pass = ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                             BDataType,
@@ -84,61 +137,9 @@ class TestGroupedGemm : public testing::TestWithParam<int>
                                                                      StrideAs,
                                                                      StrideBs,
                                                                      StrideCs,
-                                                                     kbatch,
-                                                                     n_warmup,
-                                                                     n_iter);
-        EXPECT_TRUE(pass);
-    }
-};
-
-template <typename Tuple>
-class TestGroupedGemmTwoStage : public testing::TestWithParam<int>
-{
-    protected:
-    using ALayout   = std::tuple_element_t<0, Tuple>;
-    using BLayout   = std::tuple_element_t<1, Tuple>;
-    using ELayout   = std::tuple_element_t<2, Tuple>;
-    using ADataType = std::tuple_element_t<3, Tuple>;
-    using BDataType = std::tuple_element_t<4, Tuple>;
-    using EDataType = std::tuple_element_t<5, Tuple>;
-
-    public:
-    static constexpr bool verify_     = true;
-    static constexpr int init_method_ = 1; // decimal value initialization
-    static constexpr bool log_        = false;
-    static constexpr bool bench_      = false; // measure kernel performance
-
-    void SetUp() override {}
-
-    void Run(const std::vector<int>& Ms,
-             const std::vector<int>& Ns,
-             const std::vector<int>& Ks,
-             const std::vector<int>& StrideAs,
-             const std::vector<int>& StrideBs,
-             const std::vector<int>& StrideCs,
-             int kbatch   = 1,
-             int n_warmup = 1,
-             int n_iter   = 10)
-    {
-        bool pass = ck::profiler::profile_grouped_gemm_two_stage_impl<ADataType,
-                                                                      BDataType,
-                                                                      EDataType,
-                                                                      float,
-                                                                      ALayout,
-                                                                      BLayout,
-                                                                      ELayout>(verify_,
-                                                                               init_method_,
-                                                                               log_,
-                                                                               bench_,
-                                                                               Ms,
-                                                                               Ns,
-                                                                               Ks,
-                                                                               StrideAs,
-                                                                               StrideBs,
-                                                                               StrideCs,
-                                                                               kbatch,
-                                                                               n_warmup,
-                                                                               n_iter);
+                                                                     kbatches,
+                                                                     n_warmup_,
+                                                                     n_iter_);
         EXPECT_TRUE(pass);
     }
 };
@@ -263,7 +264,7 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
             p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
         if(kbatch > 1)
         {
-            ggemm_instance.SetKBatchSize(argument, kbatch);
+            ggemm_instance.SetKBatchSize(&argument, kbatch);
         }
 
         return ggemm_instance.IsSupportedArgument(argument);
@@ -300,13 +301,13 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
             p_As, p_Bs, p_Ds, p_Cs, gemm_descs, PassThrough{}, PassThrough{}, PassThrough{});
         if(kbatch > 1)
         {
-            ggemm_instance.SetKBatchSize(argument, kbatch);
+            ggemm_instance.SetKBatchSize(&argument, kbatch);
         }
 
         EXPECT_TRUE(ggemm_instance.IsSupportedArgument(argument));
         auto invoker = ggemm_instance.MakeInvoker();
-        DeviceMem gemm_desc_workspace(ggemm_instance.GetWorkSpaceSize(&argument));
-        ggemm_instance.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
+        DeviceMem dev_gemm_kargs(ggemm_instance.GetDeviceKernelArgSize(&argument));
+        ggemm_instance.SetDeviceKernelArgs(&argument, dev_gemm_kargs.GetDeviceBuffer());
         return invoker.Run(argument, StreamConfig{nullptr, false});
     }
 };
-- 
GitLab


From fe6b185b97e9f9875ef470884e9f9fba17be02d5 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 27 Nov 2024 06:12:56 -0800
Subject: [PATCH 086/153] move utility headers from library/include to include
 path (#1697)

---
 codegen/CMakeLists.txt                                           | 1 +
 {library/include => include}/ck/library/utility/algorithm.hpp    | 0
 {library/include => include}/ck/library/utility/check_err.hpp    | 0
 {library/include => include}/ck/library/utility/conv_common.hpp  | 0
 .../utility/convolution_host_tensor_descriptor_helper.hpp        | 0
 .../ck/library/utility/convolution_parameter.hpp                 | 0
 .../include => include}/ck/library/utility/device_memory.hpp     | 0
 {library/include => include}/ck/library/utility/fill.hpp         | 0
 .../include => include}/ck/library/utility/host_common_util.hpp  | 0
 {library/include => include}/ck/library/utility/host_gemm.hpp    | 0
 {library/include => include}/ck/library/utility/host_tensor.hpp  | 0
 .../ck/library/utility/host_tensor_generator.hpp                 | 0
 {library/include => include}/ck/library/utility/iterator.hpp     | 0
 {library/include => include}/ck/library/utility/literals.hpp     | 0
 {library/include => include}/ck/library/utility/numeric.hpp      | 0
 {library/include => include}/ck/library/utility/ranges.hpp       | 0
 16 files changed, 1 insertion(+)
 rename {library/include => include}/ck/library/utility/algorithm.hpp (100%)
 rename {library/include => include}/ck/library/utility/check_err.hpp (100%)
 rename {library/include => include}/ck/library/utility/conv_common.hpp (100%)
 rename {library/include => include}/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp (100%)
 rename {library/include => include}/ck/library/utility/convolution_parameter.hpp (100%)
 rename {library/include => include}/ck/library/utility/device_memory.hpp (100%)
 rename {library/include => include}/ck/library/utility/fill.hpp (100%)
 rename {library/include => include}/ck/library/utility/host_common_util.hpp (100%)
 rename {library/include => include}/ck/library/utility/host_gemm.hpp (100%)
 rename {library/include => include}/ck/library/utility/host_tensor.hpp (100%)
 rename {library/include => include}/ck/library/utility/host_tensor_generator.hpp (100%)
 rename {library/include => include}/ck/library/utility/iterator.hpp (100%)
 rename {library/include => include}/ck/library/utility/literals.hpp (100%)
 rename {library/include => include}/ck/library/utility/numeric.hpp (100%)
 rename {library/include => include}/ck/library/utility/ranges.hpp (100%)

diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 1ca0d1282..45c47672b 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
+configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 
 find_package(ROCM)
 include(ROCMInstallTargets)
diff --git a/library/include/ck/library/utility/algorithm.hpp b/include/ck/library/utility/algorithm.hpp
similarity index 100%
rename from library/include/ck/library/utility/algorithm.hpp
rename to include/ck/library/utility/algorithm.hpp
diff --git a/library/include/ck/library/utility/check_err.hpp b/include/ck/library/utility/check_err.hpp
similarity index 100%
rename from library/include/ck/library/utility/check_err.hpp
rename to include/ck/library/utility/check_err.hpp
diff --git a/library/include/ck/library/utility/conv_common.hpp b/include/ck/library/utility/conv_common.hpp
similarity index 100%
rename from library/include/ck/library/utility/conv_common.hpp
rename to include/ck/library/utility/conv_common.hpp
diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
similarity index 100%
rename from library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
rename to include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp
diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/include/ck/library/utility/convolution_parameter.hpp
similarity index 100%
rename from library/include/ck/library/utility/convolution_parameter.hpp
rename to include/ck/library/utility/convolution_parameter.hpp
diff --git a/library/include/ck/library/utility/device_memory.hpp b/include/ck/library/utility/device_memory.hpp
similarity index 100%
rename from library/include/ck/library/utility/device_memory.hpp
rename to include/ck/library/utility/device_memory.hpp
diff --git a/library/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp
similarity index 100%
rename from library/include/ck/library/utility/fill.hpp
rename to include/ck/library/utility/fill.hpp
diff --git a/library/include/ck/library/utility/host_common_util.hpp b/include/ck/library/utility/host_common_util.hpp
similarity index 100%
rename from library/include/ck/library/utility/host_common_util.hpp
rename to include/ck/library/utility/host_common_util.hpp
diff --git a/library/include/ck/library/utility/host_gemm.hpp b/include/ck/library/utility/host_gemm.hpp
similarity index 100%
rename from library/include/ck/library/utility/host_gemm.hpp
rename to include/ck/library/utility/host_gemm.hpp
diff --git a/library/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
similarity index 100%
rename from library/include/ck/library/utility/host_tensor.hpp
rename to include/ck/library/utility/host_tensor.hpp
diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp
similarity index 100%
rename from library/include/ck/library/utility/host_tensor_generator.hpp
rename to include/ck/library/utility/host_tensor_generator.hpp
diff --git a/library/include/ck/library/utility/iterator.hpp b/include/ck/library/utility/iterator.hpp
similarity index 100%
rename from library/include/ck/library/utility/iterator.hpp
rename to include/ck/library/utility/iterator.hpp
diff --git a/library/include/ck/library/utility/literals.hpp b/include/ck/library/utility/literals.hpp
similarity index 100%
rename from library/include/ck/library/utility/literals.hpp
rename to include/ck/library/utility/literals.hpp
diff --git a/library/include/ck/library/utility/numeric.hpp b/include/ck/library/utility/numeric.hpp
similarity index 100%
rename from library/include/ck/library/utility/numeric.hpp
rename to include/ck/library/utility/numeric.hpp
diff --git a/library/include/ck/library/utility/ranges.hpp b/include/ck/library/utility/ranges.hpp
similarity index 100%
rename from library/include/ck/library/utility/ranges.hpp
rename to include/ck/library/utility/ranges.hpp
-- 
GitLab


From e7b6286441aae59d3a87db67f42369d3cc2636a4 Mon Sep 17 00:00:00 2001
From: jakpiase <jakub.piasecki@amd.com>
Date: Wed, 27 Nov 2024 18:25:07 +0100
Subject: [PATCH 087/153] Add interwave scheduler for gemm mem pipeline (#1647)

* add interwave scheduler for gemm mem pipeline

* Fix merge artifacts.

* Refactor unit tests.

* Switch to interwave scheduler for mem example

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <Adam.Osewski@amd.com>
---
 example/ck_tile/03_gemm/gemm_mem_pipeline.cpp |   3 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |   3 +-
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   | 224 ++++++++++++++++++
 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp  |  19 +-
 .../gemm/test_gemm_mem_pipeline_ut_cases.inc  |  59 ++++-
 .../gemm/test_gemm_mem_pipeline_util.hpp      |  25 +-
 6 files changed, 311 insertions(+), 22 deletions(-)

diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
index 97d150412..cd9d9d96b 100644
--- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
+++ b/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
@@ -30,7 +30,6 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     constexpr ck_tile::index_t M_Warp_Tile = 32;
     constexpr ck_tile::index_t N_Warp_Tile = 32;
     constexpr ck_tile::index_t K_Warp_Tile = 8;
-
 #else
     // Compute friendly for Intrawave scheduler
     constexpr ck_tile::index_t M_Tile = 256;
@@ -84,7 +83,7 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
                                                   AccDataType,
                                                   GemmShape,
                                                   Traits,
-                                                  ck_tile::GemmPipelineScheduler::Intrawave,
+                                                  ck_tile::GemmPipelineScheduler::Interwave,
                                                   has_hot_loop_v,
                                                   tail_number_v>>;
         using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 5199c1e3e..a1fc15577 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -200,7 +200,8 @@ int run_gemm_example(int argc, char* argv[])
         return run_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
     }
     // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not
-    // work. else if(a_layout == "C" && b_layout == "C")
+    // work.
+    // else if(a_layout == "C" && b_layout == "C")
     // {
     //     return run_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
     // }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index 4634e9dcb..847c5b187 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -322,6 +322,7 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
 
                     block_sync_lds();
+
                     LocalPrefill(a_copy_lds_window,
                                  a_block_tiles.get(number<prefetch_idx>{}),
                                  a_element_func);
@@ -374,6 +375,229 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
         }
     };
 
+    template <>
+    struct PipelineImpl<GemmPipelineScheduler::Interwave>
+    {
+        template <typename DstBlockTile, typename SrcTileWindow>
+        CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
+                                           SrcTileWindow& dram_tile_window) const
+        {
+            load_tile(dst_block_tile, dram_tile_window);
+            move_tile_window(dram_tile_window, {0, KPerBlock});
+        }
+
+        template <typename DstTileWindow, typename SrcBlockTile, typename ElementFunction>
+        CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
+                                         const SrcBlockTile& src_block_tile,
+                                         const ElementFunction& element_func) const
+        {
+            const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
+            store_tile(lds_tile_window, block_tile_tmp);
+        }
+
+        template <bool HasHotLoop,
+                  TailNumber TailNum,
+                  typename ADramBlockWindowTmp,
+                  typename BDramBlockWindowTmp,
+                  typename AElementFunction,
+                  typename BElementFunction>
+        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                       const AElementFunction& a_element_func,
+                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BElementFunction& b_element_func,
+                                       index_t num_loop,
+                                       void* p_smem) const
+        {
+            static_assert(
+                std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                    std::is_same_v<BDataType,
+                                   remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+                "A/B Dram block window should have the same data type as appropriate "
+                "([A|B]DataType) defined in Problem definition!");
+
+            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                              NPerBlock ==
+                                  BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                          "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock"
+                          " or KPerBlock!");
+
+            // ------------------------------------------------------------------------------------
+            // Definitions of all needed tiles
+
+            // A tile in LDS
+            ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
+            constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+            auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+            // TODO: LDS alignment should come from Policy!
+            constexpr index_t a_lds_block_space_size_aligned =
+                integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(),
+                                    16) *
+                16;
+
+            // B tile in LDS
+            BDataType* p_b_lds = static_cast<BDataType*>(
+                static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+            constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+            auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+            // A DRAM tile window for load
+            auto a_copy_dram_window =
+                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 a_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeADramTileDistribution<Problem>());
+
+            // A LDS tile window for store
+            auto a_copy_lds_window =
+                make_tile_window(a_lds_block,
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 a_copy_dram_window.get_tile_distribution());
+            // B DRAM tile window for load
+            auto b_copy_dram_window =
+                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 b_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeBDramTileDistribution<Problem>());
+
+            // B LDS tile window for store
+            auto b_copy_lds_window =
+                make_tile_window(b_lds_block,
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 b_copy_dram_window.get_tile_distribution());
+
+            // A LDS tile for block GEMM
+            auto a_lds_gemm_window = make_tile_window(
+                a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+            // B LDS tile for block GEMM
+            auto b_lds_gemm_window = make_tile_window(
+                b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+            // Block GEMM
+            auto block_gemm   = BlockGemm();
+            auto c_block_tile = block_gemm.MakeCBlockTile();
+
+            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+            using ABlockTile =
+                decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+            using BBlockTile =
+                decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+            tuple_array<ABlockTile, PrefetchStages> a_block_tiles;
+            tuple_array<BBlockTile, PrefetchStages> b_block_tiles;
+
+            // -----------------------------------------------------------------------------------------
+            // Gemm pipeline start
+
+            // prefetch
+            // global read 0
+            GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
+            GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
+
+            // initialize C
+            tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+            // LDS write 0
+            LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+            LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+
+            // Global prefetch [1, PrefetchStages]
+            static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
+                GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
+                GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
+            });
+
+            // main body
+            if constexpr(HasHotLoop)
+            {
+                index_t i = 0;
+                do
+                {
+                    static_for<0, PrefetchStages, 1>{}([&](auto prefetch_idx) {
+                        block_sync_lds();
+                        block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+                        // no second block_sync_lds because it's interwave
+
+                        LocalPrefill(
+                            a_copy_lds_window,
+                            a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
+                            a_element_func);
+                        LocalPrefill(
+                            b_copy_lds_window,
+                            b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
+                            b_element_func);
+
+                        GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
+                                       a_copy_dram_window);
+                        GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
+                                       b_copy_dram_window);
+                    });
+
+                    i += PrefetchStages;
+                } while(i < (num_loop - PrefetchStages));
+            }
+
+            auto HotLoopTail = [&](auto tail_num) {
+                static_for<1, tail_num, 1>{}([&](auto prefetch_idx) {
+                    block_sync_lds();
+                    block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+                    // no second block_sync_lds because it's interwave
+
+                    LocalPrefill(a_copy_lds_window,
+                                 a_block_tiles.get(number<prefetch_idx>{}),
+                                 a_element_func);
+                    LocalPrefill(b_copy_lds_window,
+                                 b_block_tiles.get(number<prefetch_idx>{}),
+                                 b_element_func);
+                });
+
+                block_sync_lds();
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            };
+
+            if constexpr(TailNum == TailNumber::One)
+            {
+                block_sync_lds();
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            }
+            else if constexpr(TailNum == TailNumber::Two)
+            {
+                HotLoopTail(number<2>{});
+            }
+            else if constexpr(TailNum == TailNumber::Three)
+            {
+                HotLoopTail(number<3>{});
+            }
+            else if constexpr(TailNum == TailNumber::Four)
+            {
+                HotLoopTail(number<4>{});
+            }
+            else if constexpr(TailNum == TailNumber::Five)
+            {
+                HotLoopTail(number<5>{});
+            }
+            else if constexpr(TailNum == TailNumber::Six)
+            {
+                HotLoopTail(number<6>{});
+            }
+            else if constexpr(TailNum == TailNumber::Seven)
+            {
+                HotLoopTail(number<7>{});
+            }
+            else if constexpr(TailNum == TailNumber::Full)
+            {
+                HotLoopTail(number<PrefetchStages>{});
+            }
+
+            return c_block_tile;
+        }
+    };
+
     template <typename ADramBlockWindowTmp,
               typename BDramBlockWindowTmp,
               typename AElementFunction,
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
index f72a80b5a..a1c80fee4 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
@@ -11,8 +11,20 @@
 using F16 = ck_tile::half_t;
 using F32 = float;
 
-using Row = ck_tile::tensor_layout::gemm::RowMajor;
-using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+using Row                       = ck_tile::tensor_layout::gemm::RowMajor;
+using Col                       = ck_tile::tensor_layout::gemm::ColumnMajor;
+static constexpr auto Intrawave = ck_tile::GemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = ck_tile::GemmPipelineScheduler::Interwave;
+
+template <typename Tuple>
+class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline<Tuple, Intrawave>
+{
+};
+
+template <typename Tuple>
+class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline<Tuple, Interwave>
+{
+};
 
 // clang-format off
 using KernelTypes = ::testing::Types<
@@ -24,6 +36,7 @@ using KernelTypes = ::testing::Types<
     >;
 // clang-format on
 
-TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
+TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes);
+TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes);
 
 #include "test_gemm_mem_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
index b26114f39..6b914e797 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
@@ -1,6 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
 
-TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
+//------------------------------------------------------------------------------------------------
+//              INTERWAVE SCHEDULER
+//------------------------------------------------------------------------------------------------
+
+TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 1024;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 1024;
+    constexpr int K = 320;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 1024;
+    constexpr int K = 432;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 1024;
+    constexpr int K = 512;
+
+    for(int M : Ms)
+        this->Run(M, N, K);
+}
+
+//------------------------------------------------------------------------------------------------
+//              INTRAWAVE SCHEDULER
+//------------------------------------------------------------------------------------------------
+
+TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 1024;
@@ -10,7 +61,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
+TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 1024;
@@ -20,7 +71,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
+TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 1024;
@@ -30,7 +81,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
+TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 1024;
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
index 6b4789833..15f9f516e 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
@@ -11,20 +11,21 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
-template <typename Tuple>
+template <typename Tuple, ck_tile::GemmPipelineScheduler Scheduler_>
 class TestCkTileGemmMemPipeline : public ::testing::Test
 {
     protected:
-    using ALayout     = std::tuple_element_t<0, Tuple>;
-    using BLayout     = std::tuple_element_t<1, Tuple>;
-    using CLayout     = std::tuple_element_t<2, Tuple>;
-    using ADataType   = std::tuple_element_t<3, Tuple>;
-    using BDataType   = std::tuple_element_t<4, Tuple>;
-    using AccDataType = std::tuple_element_t<5, Tuple>;
-    using CDataType   = std::tuple_element_t<6, Tuple>;
+    using ALayout                   = std::tuple_element_t<0, Tuple>;
+    using BLayout                   = std::tuple_element_t<1, Tuple>;
+    using CLayout                   = std::tuple_element_t<2, Tuple>;
+    using ADataType                 = std::tuple_element_t<3, Tuple>;
+    using BDataType                 = std::tuple_element_t<4, Tuple>;
+    using AccDataType               = std::tuple_element_t<5, Tuple>;
+    using CDataType                 = std::tuple_element_t<6, Tuple>;
+    static constexpr auto Scheduler = Scheduler_;
     // TODO: expose tile size through test t-param ?
 
-    struct gemm_basic_args
+    struct gemm_args
     {
         const void* p_a;
         const void* p_b;
@@ -38,7 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         ck_tile::index_t stride_C;
     };
 
-    void invoke_gemm(const gemm_basic_args& args, const ck_tile::stream_config& s)
+    void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s)
     {
         // TODO: This should be parameterized in tests
         constexpr ck_tile::index_t M_Tile = 128;
@@ -89,7 +90,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
                                                       AccDataType,
                                                       GemmShape,
                                                       Traits,
-                                                      ck_tile::GemmPipelineScheduler::Intrawave,
+                                                      Scheduler,
                                                       has_hot_loop_v,
                                                       tail_number_v>>;
             using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
@@ -288,7 +289,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         c_m_n_dev_buf.SetZero();
         c_m_n_dev_result.SetZero();
 
-        gemm_basic_args args;
+        gemm_args args;
         args.p_a      = a_m_k_dev_buf.GetDeviceBuffer();
         args.p_b      = b_k_n_dev_buf.GetDeviceBuffer();
         args.p_c      = c_m_n_dev_buf.GetDeviceBuffer();
-- 
GitLab


From f49b595dc02f3a40b61455c6914e8456b5f42f41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Thu, 28 Nov 2024 17:51:49 +0100
Subject: [PATCH 088/153] [CK TILE] Add gemm compute pipeline v3 (#1661)

* [CK TILE] Add gemm compute pipeline v3

* Enable universal gemm compute pipeline.

* Rename example and add compute pipeline.

* Introduce ag bg cr pipeline impl base.

* Refactor to reuse code.

* Cleaning

* Formatting.

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <Adam.Osewski@amd.com>
---
 example/ck_tile/03_gemm/CMakeLists.txt        |   2 +-
 ...mm_mem_pipeline.cpp => universal_gemm.cpp} |  25 +-
 include/ck_tile/ops/gemm.hpp                  |   2 +
 .../block/block_universal_gemm_as_bs_cr.hpp   | 223 +++++-----
 .../pipeline/gemm_pipeline_ag_bg_cr_base.hpp  | 111 +++++
 .../gemm_pipeline_ag_bg_cr_comp_v3.hpp        | 383 ++++++++++++++++++
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   | 266 ++++--------
 7 files changed, 712 insertions(+), 300 deletions(-)
 rename example/ck_tile/03_gemm/{gemm_mem_pipeline.cpp => universal_gemm.cpp} (89%)
 create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
 create mode 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp

diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index 8ae46cadc..d166eed45 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
-add_executable(tile_example_gemm_mem_pipeline EXCLUDE_FROM_ALL gemm_mem_pipeline.cpp)
+add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp)
diff --git a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
similarity index 89%
rename from example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
rename to example/ck_tile/03_gemm/universal_gemm.cpp
index cd9d9d96b..eaafc13b9 100644
--- a/example/ck_tile/03_gemm/gemm_mem_pipeline.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -14,10 +14,17 @@
 #include "ck_tile/host.hpp"
 #include "gemm_basic.hpp"
 
+#define CK_TILE_PIPELINE_COMPUTE 1
+#define CK_TILE_PIPELINE_MEMORY 2
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE
+#endif
+
 template <typename ALayout, typename BLayout, typename CLayout>
 float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 {
-#if 1
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
     // Memory friendly for Interwave scheduler
     constexpr ck_tile::index_t M_Tile = 128;
     constexpr ck_tile::index_t N_Tile = 32;
@@ -30,7 +37,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     constexpr ck_tile::index_t M_Warp_Tile = 32;
     constexpr ck_tile::index_t N_Warp_Tile = 32;
     constexpr ck_tile::index_t K_Warp_Tile = 8;
-#else
+
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
     // Compute friendly for Intrawave scheduler
     constexpr ck_tile::index_t M_Tile = 256;
     constexpr ck_tile::index_t N_Tile = 256;
@@ -63,8 +71,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
         ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
 
     using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
     using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+    using BaseGemmPipeline                 = ck_tile::BaseGemmPipelineAgBgCrCompV3<
+#endif
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
 
     const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
@@ -77,13 +88,21 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
         constexpr bool has_hot_loop_v = has_hot_loop_.value;
         constexpr auto tail_number_v  = tail_number_.value;
 
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
         using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+        using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<
+#endif
             ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                   BDataType,
                                                   AccDataType,
                                                   GemmShape,
                                                   Traits,
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
                                                   ck_tile::GemmPipelineScheduler::Interwave,
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+                                                  ck_tile::GemmPipelineScheduler::Intrawave,
+#endif
                                                   has_hot_loop_v,
                                                   tail_number_v>>;
         using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 9a033ee2d..1340fb204 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -25,6 +25,8 @@
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index 5f98a7a0b..c9e648f43 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -41,13 +41,16 @@ struct BlockUniversalGemmAsBsCr
         static constexpr index_t MWarp = config.template at<1>();
         static constexpr index_t NWarp = config.template at<2>();
 
-        static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}),
+        using I0 = number<0>;
+        using I1 = number<1>;
+
+        static_assert(MWarp == BlockGemmShape::BlockWarps::at(I0{}),
                       "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!");
-        static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}),
+        static_assert(NWarp == BlockGemmShape::BlockWarps::at(I1{}),
                       "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!");
-        static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}),
+        static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(I0{}),
                       "Error! WarpGemm's M is not consisten with BlockGemmShape!");
-        static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}),
+        static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(I1{}),
                       "Error! WarpGemm's N is not consisten with BlockGemmShape!");
 
         static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
@@ -99,6 +102,9 @@ struct BlockUniversalGemmAsBsCr
 
     static constexpr auto Scheduler = Traits::Scheduler;
 
+    using I0 = number<0>;
+    using I1 = number<1>;
+
     private:
     template <GemmPipelineScheduler Scheduler, typename GemmTraits>
     struct BlockGemmImpl
@@ -114,35 +120,31 @@ struct BlockUniversalGemmAsBsCr
                                        const ASmemBlockWindow& a_block_window,
                                        const BSmemBlockWindow& b_block_window)
         {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                           "The ADataType and BDataType as defined in "
                           "traits should be the same as correspoinding block window data type!");
 
             static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                 "MPerBlock, NPerBlock, KPerBlock defined in "
                 " BlockGemmShape are different from A/B block smem windows apropriate dims!");
 
-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
 
             // TODO: refactor warp_window tile type to class member as it should be
             // compile-time known information.
             auto a_warp_window_tmp = make_tile_window(
                 a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
-                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
+                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
 
             using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
 
@@ -156,16 +158,15 @@ struct BlockUniversalGemmAsBsCr
 
             statically_indexed_array<
                 statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::MIterPerWarp>
+                MIterPerWarp>
                 a_warp_windows;
 
             // construct B-warp-window
             auto b_warp_window_tmp = make_tile_window(
                 b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
-                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
+                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
 
             using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
 
@@ -179,10 +180,10 @@ struct BlockUniversalGemmAsBsCr
 
             statically_indexed_array<
                 statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::NIterPerWarp>
+                NIterPerWarp>
                 b_warp_windows;
 
-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                 static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                     a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
 
@@ -193,7 +194,7 @@ struct BlockUniversalGemmAsBsCr
                 });
             });
 
-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                 static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                     b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
 
@@ -203,8 +204,8 @@ struct BlockUniversalGemmAsBsCr
                 });
             });
 
-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;
 
             constexpr auto c_warp_y_lengths =
                 to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
@@ -212,10 +213,10 @@ struct BlockUniversalGemmAsBsCr
 
             // hot loop:
             static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                     const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
 
-                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                         const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
 
                         // read C warp tensor from C block tensor-
@@ -226,7 +227,7 @@ struct BlockUniversalGemmAsBsCr
                             merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
 
                         // warp GEMM
-                        typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);
+                        WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);
 
                         // write C warp tensor into C block tensor
                         c_block_tensor.set_y_sliced_thread_data(
@@ -243,13 +244,13 @@ struct BlockUniversalGemmAsBsCr
     struct BlockGemmImpl<GemmPipelineScheduler::Intrawave, GemmTraits>
     {
         statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::AWarpTile, GemmTraits::KIterPerWarp>,
-            GemmTraits::MIterPerWarp>
+            statically_indexed_array<typename GemmTraits::AWarpTile, KIterPerWarp>,
+            MIterPerWarp>
             a_warp_tiles_;
 
         statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::BWarpTile, GemmTraits::KIterPerWarp>,
-            GemmTraits::NIterPerWarp>
+            statically_indexed_array<typename GemmTraits::BWarpTile, KIterPerWarp>,
+            NIterPerWarp>
             b_warp_tiles_;
 
         template <typename ASmemBlockWindow, typename BSmemBlockWindow>
@@ -257,30 +258,27 @@ struct BlockUniversalGemmAsBsCr
                                           const BSmemBlockWindow& b_block_window)
         {
             static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                 "MPerBlock, NPerBlock, KPerBlock defined in "
                 " BlockGemmShape are different from A/B block smem windows apropriate dims!");
 
-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                           "The ADataType and BDataType as defined in "
                           "traits should be the same as correspoinding block window data type!");
 
-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
 
             // TODO: refactor warp_window tile type to class member as it should be
             // compile-time known information.
             auto a_warp_window_tmp = make_tile_window(
                 a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
-                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
+                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
 
             using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
 
@@ -292,18 +290,16 @@ struct BlockUniversalGemmAsBsCr
                               AWarpWindow{}.get_window_lengths(),
                           "AWarpWindow lengths must be equal to AWarpTile lengths!");
 
-            statically_indexed_array<
-                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::MIterPerWarp>
+            statically_indexed_array<statically_indexed_array<AWarpWindow, KIterPerWarp>,
+                                     MIterPerWarp>
                 a_warp_windows;
 
             // construct B-warp-window
             auto b_warp_window_tmp = make_tile_window(
                 b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
-                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
+                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
 
             using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
 
@@ -315,13 +311,12 @@ struct BlockUniversalGemmAsBsCr
                               BWarpWindow{}.get_window_lengths(),
                           "BWarpWindow lengths must be equal to BWarpTile lengths!");
 
-            statically_indexed_array<
-                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::NIterPerWarp>
+            statically_indexed_array<statically_indexed_array<BWarpWindow, KIterPerWarp>,
+                                     NIterPerWarp>
                 b_warp_windows;
 
-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                     a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
 
                     // TODO: I don't have to move 0,0 window!
@@ -331,8 +326,8 @@ struct BlockUniversalGemmAsBsCr
                 });
             });
 
-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                     b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
 
                     move_tile_window(b_warp_windows(nIter)(kIter),
@@ -341,12 +336,12 @@ struct BlockUniversalGemmAsBsCr
                 });
             });
 
-            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                     // read A warp tensor from A block window
                     load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
                 });
-                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                     // read B warp tensor from B Block window
                     load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
                 });
@@ -359,22 +354,21 @@ struct BlockUniversalGemmAsBsCr
                                        [[maybe_unused]] const ASmemBlockWindow& a_block_window,
                                        [[maybe_unused]] const BSmemBlockWindow& b_block_window)
         {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");
 
-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;
 
             constexpr auto c_warp_y_lengths =
                 to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
             constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
 
             // hot loop:
-            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                         // read C warp tensor from C block tensor-
                         CWarpTensor c_warp_tensor;
 
@@ -383,9 +377,9 @@ struct BlockUniversalGemmAsBsCr
                             merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
 
                         // warp GEMM
-                        typename GemmTraits::WarpGemm{}(c_warp_tensor,
-                                                        a_warp_tiles_[mIter][kIter],
-                                                        b_warp_tiles_[nIter][kIter]);
+                        WarpGemm{}(c_warp_tensor,
+                                   a_warp_tiles_[mIter][kIter],
+                                   b_warp_tiles_[nIter][kIter]);
 
                         // write C warp tensor into C block tensor
                         c_block_tensor.set_y_sliced_thread_data(
@@ -412,12 +406,12 @@ struct BlockUniversalGemmAsBsCr
 
         statically_indexed_array<
             statically_indexed_array<typename GemmTraits::AWarpTile, KInnerLoopIter>,
-            GemmTraits::MIterPerWarp>
+            MIterPerWarp>
             a_warp_tiles_;
 
         statically_indexed_array<
             statically_indexed_array<typename GemmTraits::BWarpTile, KInnerLoopIter>,
-            GemmTraits::NIterPerWarp>
+            NIterPerWarp>
             b_warp_tiles_;
 
         template <index_t KIdx, typename ASmemBlockWindow, typename BSmemBlockWindow>
@@ -425,30 +419,28 @@ struct BlockUniversalGemmAsBsCr
                                           const BSmemBlockWindow& b_block_window)
         {
             static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                 "MPerBlock, NPerBlock, KPerBlock defined in "
                 " BlockGemmShape are different from A/B block smem windows apropriate dims!");
 
-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                           "The ADataType and BDataType as defined in "
                           "traits should be the same as correspoinding block window data type!");
 
-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
 
             // TODO: refactor warp_window tile type to class member as it should be
             // compile-time known information.
             auto a_warp_window_tmp = make_tile_window(
                 a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
                 a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                    multi_index<2>{iMWarp * WarpGemm::kM, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
 
             using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
 
@@ -461,16 +453,16 @@ struct BlockUniversalGemmAsBsCr
                           "AWarpWindow lengths must be equal to AWarpTile lengths!");
 
             statically_indexed_array<statically_indexed_array<AWarpWindow, KInnerLoopIter>,
-                                     GemmTraits::MIterPerWarp>
+                                     MIterPerWarp>
                 a_warp_windows;
 
             // construct B-warp-window
             auto b_warp_window_tmp = make_tile_window(
                 b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
                 b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                    multi_index<2>{iNWarp * WarpGemm::kN, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
 
             using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
 
@@ -483,10 +475,10 @@ struct BlockUniversalGemmAsBsCr
                           "BWarpWindow lengths must be equal to BWarpTile lengths!");
 
             statically_indexed_array<statically_indexed_array<BWarpWindow, KInnerLoopIter>,
-                                     GemmTraits::NIterPerWarp>
+                                     NIterPerWarp>
                 b_warp_windows;
 
-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                 static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                     a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
 
@@ -496,7 +488,7 @@ struct BlockUniversalGemmAsBsCr
                 });
             });
 
-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                 static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                     b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
 
@@ -508,11 +500,11 @@ struct BlockUniversalGemmAsBsCr
 
             // TODO check if a_warp_tiles has same desc as a_warp_window
             static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                     // read A warp tensor from A block window
                     load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
                 });
-                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                     // read B warp tensor from B Block window
                     load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
                 });
@@ -525,13 +517,12 @@ struct BlockUniversalGemmAsBsCr
                                        const ASmemBlockWindow& a_block_window,
                                        const BSmemBlockWindow& b_block_window)
         {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");
 
-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;
 
             constexpr auto c_warp_y_lengths =
                 to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
@@ -555,8 +546,8 @@ struct BlockUniversalGemmAsBsCr
                 }
 
                 static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) {
-                    static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                        static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                             // read C warp tensor from C block tensor-
                             CWarpTensor c_warp_tensor;
 
@@ -573,17 +564,17 @@ struct BlockUniversalGemmAsBsCr
                             // penalty
                             if constexpr(kIter.value == KRepeat - 1 &&
                                          kInnerIter.value == KInnerLoopIter - 1 &&
-                                         mIter.value == GemmTraits::MIterPerWarp - 1 &&
-                                         nIter.value == GemmTraits::NIterPerWarp - 1)
+                                         mIter.value == MIterPerWarp - 1 &&
+                                         nIter.value == NIterPerWarp - 1)
                             {
                                 __builtin_amdgcn_sched_barrier(0);
                                 block_sync_lds();
                                 __builtin_amdgcn_sched_barrier(0);
                             }
                             // warp GEMM
-                            typename GemmTraits::WarpGemm{}(c_warp_tensor,
-                                                            a_warp_tiles_[mIter][kInnerIter],
-                                                            b_warp_tiles_[nIter][kInnerIter]);
+                            WarpGemm{}(c_warp_tensor,
+                                       a_warp_tiles_[mIter][kInnerIter],
+                                       b_warp_tiles_[nIter][kInnerIter]);
 
                             // write C warp tensor into C block tensor
                             c_block_tensor.set_y_sliced_thread_data(
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
new file mode 100644
index 000000000..431534af1
--- /dev/null
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename Policy>
+struct GemmPipelineAgBgCrImplBase
+{
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    template <typename DstBlockTile, typename SrcTileWindow>
+    CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
+                                       SrcTileWindow& dram_tile_window) const
+    {
+        load_tile(dst_block_tile, dram_tile_window);
+        move_tile_window(dram_tile_window, {0, KPerBlock});
+    }
+
+    template <typename DstTileWindow, typename SrcBlockTile, typename ElementFunction>
+    CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
+                                     const SrcBlockTile& src_block_tile,
+                                     const ElementFunction& element_func) const
+    {
+        const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
+        store_tile(lds_tile_window, block_tile_tmp);
+    }
+
+    CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const
+    {
+        // A tile in LDS
+        ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
+        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        // TODO: LDS alignment should come from Policy!
+        constexpr index_t a_lds_block_space_size_aligned =
+            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            16;
+
+        // B tile in LDS
+        BDataType* p_b_lds = static_cast<BDataType*>(
+            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+        return make_tuple(std::move(a_lds_block), std::move(b_lds_block));
+    }
+
+    template <typename ADramBlockWindowTmp, typename ALdsTensorView>
+    CK_TILE_DEVICE auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                    const ALdsTensorView& a_lds_block_view) const
+    {
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store
+        auto a_copy_lds_window =
+            make_tile_window(a_lds_block_view,
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             a_copy_dram_window.get_tile_distribution());
+
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block_view, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        return make_tuple(std::move(a_copy_dram_window),
+                          std::move(a_copy_lds_window),
+                          std::move(a_lds_gemm_window));
+    }
+
+    template <typename BDramBlockWindowTmp, typename BLdsTensorView>
+    CK_TILE_DEVICE auto GetBWindows(const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                    const BLdsTensorView& b_lds_block_view) const
+    {
+        auto b_copy_dram_window =
+            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             b_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeBDramTileDistribution<Problem>());
+
+        // B LDS tile window for store
+        auto b_copy_lds_window =
+            make_tile_window(b_lds_block_view,
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             b_copy_dram_window.get_tile_distribution());
+
+        auto b_lds_gemm_window = make_tile_window(
+            b_lds_block_view, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        return make_tuple(std::move(b_copy_dram_window),
+                          std::move(b_copy_lds_window),
+                          std::move(b_lds_gemm_window));
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
new file mode 100644
index 000000000..a72728b4a
--- /dev/null
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
+
+namespace ck_tile {
+
+//  A Tile Window: global memory
+//  B Tile Window: global memory
+//  C Distributed tensor: register
+template <typename Problem>
+struct BaseGemmPipelineAgBgCrCompV3
+{
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+};
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy>
+struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
+{
+    using Base             = BaseGemmPipelineAgBgCrCompV3<Problem>;
+    using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
+
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    using ALayout = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout = remove_cvref_t<typename Problem::BLayout>;
+    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+
+    using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
+    using I0        = number<0>;
+    using I1        = number<1>;
+    using I2        = number<2>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    static constexpr index_t VectorSizeA = Problem::VectorSizeA;
+    static constexpr index_t VectorSizeB = Problem::VectorSizeB;
+    static constexpr index_t VectorSizeC = Problem::VectorSizeC;
+
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
+
+    // Where is the right place for HasHotLoop and TailNum ???
+    static constexpr bool HasHotLoop = Problem::HasHotLoop;
+    static constexpr auto TailNum    = Problem::TailNum;
+    static constexpr auto Scheduler  = Problem::Scheduler;
+
+    using Base::PrefetchStages;
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <GemmPipelineScheduler Scheduler>
+    struct PipelineImpl : public PipelineImplBase
+    {
+    };
+
+    template <>
+    struct PipelineImpl<GemmPipelineScheduler::Intrawave> : public PipelineImplBase
+    {
+        using Base = PipelineImplBase;
+
+        CK_TILE_DEVICE static constexpr auto HotLoopScheduler()
+        {
+            constexpr index_t MPerXDL = BlockGemmShape::WarpTile::at(I0{});
+            constexpr index_t NPerXDL = BlockGemmShape::WarpTile::at(I1{});
+            constexpr index_t KPerXDL = BlockGemmShape::WarpTile::at(I2{});
+
+            constexpr index_t WaveSize = 64;
+            constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
+            constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
+
+            constexpr index_t A_LDS_Read_Width = KPerXDL;
+            constexpr index_t B_LDS_Read_Width = KPerXDL;
+
+            constexpr index_t A_Buffer_Load_Inst_Num =
+                MPerBlock * KPerBlock / (BlockSize * VectorSizeA);
+            constexpr index_t B_Buffer_Load_Inst_Num =
+                NPerBlock * KPerBlock / (BlockSize * VectorSizeB);
+
+            constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL);
+            constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL);
+
+            constexpr index_t A_LDS_Read_Inst_Num =
+                WaveNumN * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
+            constexpr index_t B_LDS_Read_Inst_Num =
+                WaveNumM * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
+
+            constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
+                                                (BlockSize / WaveSize) /
+                                                (MPerXDL * NPerXDL * KPerXDL);
+
+            // A/B split schedule
+            // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
+            constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16
+                                                    ? A_LDS_Read_Inst_Num
+                                                    : A_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16
+                                                    ? B_LDS_Read_Inst_Num
+                                                    : B_LDS_Read_Inst_Num / 2;
+
+            constexpr auto num_ds_write_inst_a = A_LDS_Write_Inst_Num;
+            constexpr auto num_ds_write_inst_b = B_LDS_Write_Inst_Num;
+
+            constexpr auto num_buffer_load_inst_a = A_Buffer_Load_Inst_Num;
+            constexpr auto num_buffer_load_inst_b = B_Buffer_Load_Inst_Num;
+
+            constexpr auto num_mfma_inst = C_MFMA_Inst_Num;
+
+            constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+            constexpr auto ds_read_a_issue_cycle =
+                A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+            constexpr auto ds_read_b_issue_cycle =
+                B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+            constexpr auto ds_read_a_mfma_rate =
+                (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
+            constexpr auto ds_read_b_mfma_rate =
+                (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
+
+            constexpr auto num_dsread_a_mfma =
+                (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+            constexpr auto num_dsread_b_mfma =
+                (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
+
+            // stage 1
+            // Separate this part?
+            // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
+            //                                               sizeof(ComputeDataType) /
+            //                                               sizeof(BDataType)
+            //                                           ? sizeof(ComputeDataType) /
+            //                                           sizeof(ADataType) : sizeof(ComputeDataType)
+            //                                           / sizeof(BDataType);
+            constexpr auto num_mfma_stage1 =
+                num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
+            constexpr auto num_mfma_per_issue =
+                num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
+            constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
+            constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
+
+            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+                ignore = i;
+                static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
+                    ignore = idswrite;
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                });
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA
+            });
+            static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+                ignore = i;
+                static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
+                    ignore = idswrite;
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                });
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA
+            });
+
+            // stage 2
+            static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) {
+                if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
+                             ds_read_a_mfma_rate)
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+                else
+                {
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x100,
+                        num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate,
+                        0); // DS read
+                }
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+
+            static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) {
+                if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
+                             ds_read_b_mfma_rate)
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read
+                }
+                else
+                {
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x100,
+                        num_ds_read_inst_b - (num_dsread_b_mfma - 1) * ds_read_b_mfma_rate,
+                        0); // DS read
+                }
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+        }
+
+        template <bool HasHotLoop,
+                  TailNumber TailNum,
+                  typename ADramBlockWindowTmp,
+                  typename BDramBlockWindowTmp,
+                  typename AElementFunction,
+                  typename BElementFunction>
+        CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                       const AElementFunction& a_element_func,
+                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                       const BElementFunction& b_element_func,
+                                       index_t num_loop,
+                                       void* p_smem) const
+        {
+            static_assert(
+                std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                    std::is_same_v<BDataType,
+                                   remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+                "A/B Dram block window should have the same data type as appropriate "
+                "([A|B]DataType) defined in Problem definition!");
+
+            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}],
+                          "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock"
+                          " or KPerBlock!");
+
+            // ------------------------------------------------------------------------------------
+            // Definitions of all needed tiles
+
+            // A/B tiles in LDS
+            auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem);
+
+            // A DRAM tile window for load
+            // A LDS tile window for store
+            // A LDS tile for block GEMM
+            auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] =
+                Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+
+            // B DRAM tile window for load
+            // B LDS tile window for store
+            // B LDS tile for block GEMM
+            auto&& [b_copy_dram_window, b_copy_lds_window, b_lds_gemm_window] =
+                Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+
+            // Block GEMM
+            auto block_gemm   = BlockGemm();
+            auto c_block_tile = block_gemm.MakeCBlockTile();
+
+            using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+            using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+            using ABlockTile =
+                decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+            using BBlockTile =
+                decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+            ABlockTile a_block_tile;
+            BBlockTile b_block_tile;
+
+            // -----------------------------------------------------------------------------------------
+            // Gemm pipeline start
+
+            // prefetch
+            // global read 0
+            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window);
+            Base::GlobalPrefetch(b_block_tile, b_copy_dram_window);
+
+            // initialize C
+            tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+            // LDS write 0
+            Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+            Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+
+            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window);
+            Base::GlobalPrefetch(b_block_tile, b_copy_dram_window);
+
+            block_sync_lds();
+            block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            // main body
+            if constexpr(HasHotLoop)
+            {
+                index_t i = 0;
+                do
+                {
+                    block_sync_lds();
+
+                    Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+                    Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+
+                    Base::GlobalPrefetch(a_block_tile, a_copy_dram_window);
+                    Base::GlobalPrefetch(b_block_tile, b_copy_dram_window);
+
+                    block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+
+                    block_sync_lds();
+                    block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+
+                    i += 1;
+                } while(i < (num_loop - 1));
+            }
+            // tail
+            if constexpr(TailNum == TailNumber::Full)
+            {
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            }
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+            return c_block_tile;
+        }
+    };
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   const BElementFunction& b_element_func,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
+            a_dram_block_window_tmp,
+            a_element_func,
+            b_dram_block_window_tmp,
+            b_element_func,
+            num_loop,
+            p_smem);
+    }
+
+    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
+            a_dram_block_window_tmp,
+            [](const ADataType& a) { return a; },
+            b_dram_block_window_tmp,
+            [](const BDataType& b) { return b; },
+            num_loop,
+            p_smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index 847c5b187..e2e94cf92 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 
 namespace ck_tile {
 
@@ -90,7 +91,8 @@ struct BaseGemmPipelineAgBgCrMem
 template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy>
 struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 {
-    using Base = BaseGemmPipelineAgBgCrMem<Problem>;
+    using Base             = BaseGemmPipelineAgBgCrMem<Problem>;
+    using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
 
     using ADataType      = remove_cvref_t<typename Problem::ADataType>;
     using BDataType      = remove_cvref_t<typename Problem::BDataType>;
@@ -103,8 +105,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
     using I0        = number<0>;
+    using I1        = number<1>;
+    using I2        = number<2>;
 
-    static constexpr index_t BlockSize = Problem::kBlockSize;
     static constexpr index_t MPerBlock = BlockGemmShape::kM;
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
@@ -124,46 +127,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
     using Base::PrefetchStages;
 
-    CK_TILE_HOST_DEVICE constexpr index_t GetStaticLdsSize()
-    {
-        return integer_divide_ceil(
-                   sizeof(ADataType) *
-                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
-                   16) *
-                   16 +
-               sizeof(BDataType) *
-                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
-    }
-
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         return Policy::template GetSmemSize<Problem>();
     }
 
     template <GemmPipelineScheduler Scheduler>
-    struct PipelineImpl
+    struct PipelineImpl : public PipelineImplBase
     {
     };
 
     template <>
-    struct PipelineImpl<GemmPipelineScheduler::Intrawave>
+    struct PipelineImpl<GemmPipelineScheduler::Intrawave> : public PipelineImplBase
     {
-        template <typename DstBlockTile, typename SrcTileWindow>
-        CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
-                                           SrcTileWindow& dram_tile_window) const
-        {
-            load_tile(dst_block_tile, dram_tile_window);
-            move_tile_window(dram_tile_window, {0, KPerBlock});
-        }
-
-        template <typename DstTileWindow, typename SrcBlockTile, typename ElementFunction>
-        CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
-                                         const SrcBlockTile& src_block_tile,
-                                         const ElementFunction& element_func) const
-        {
-            const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
-            store_tile(lds_tile_window, block_tile_tmp);
-        }
+        using Base = PipelineImplBase;
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
@@ -185,66 +162,38 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 "A/B Dram block window should have the same data type as appropriate "
                 "([A|B]DataType) defined in Problem definition!");
 
-            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                              NPerBlock ==
-                                  BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}],
                           "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock"
                           " or KPerBlock!");
 
             // ------------------------------------------------------------------------------------
             // Definitions of all needed tiles
 
-            // A tile in LDS
-            ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
-            constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
-            auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
-
-            // TODO: LDS alignment should come from Policy!
-            constexpr index_t a_lds_block_space_size_aligned =
-                integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(),
-                                    16) *
-                16;
-
-            // B tile in LDS
-            BDataType* p_b_lds = static_cast<BDataType*>(
-                static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
-            constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
-            auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+            // A/B tiles in LDS
+            // With c++20 could simplify to below line.
+            // Currently get error: captured structured bindings are a C++20 extension
+            // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem);
+            auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem);
+            auto& a_lds_block  = ab_lds_blocks.at(I0{});
+            auto& b_lds_block  = ab_lds_blocks.at(I1{});
 
             // A DRAM tile window for load
-            auto a_copy_dram_window =
-                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                 a_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeADramTileDistribution<Problem>());
-
             // A LDS tile window for store
-            auto a_copy_lds_window =
-                make_tile_window(a_lds_block,
-                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                 {0, 0},
-                                 a_copy_dram_window.get_tile_distribution());
-            // B DRAM tile window for load
-            auto b_copy_dram_window =
-                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                 b_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeBDramTileDistribution<Problem>());
+            // A LDS tile for block GEMM
+            auto a_windows           = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+            auto& a_copy_dram_window = a_windows.at(I0{});
+            auto& a_copy_lds_window  = a_windows.at(I1{});
+            auto& a_lds_gemm_window  = a_windows.at(I2{});
 
+            // B DRAM tile window for load
             // B LDS tile window for store
-            auto b_copy_lds_window =
-                make_tile_window(b_lds_block,
-                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                 {0, 0},
-                                 b_copy_dram_window.get_tile_distribution());
-
-            // A LDS tile for block GEMM
-            auto a_lds_gemm_window = make_tile_window(
-                a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
             // B LDS tile for block GEMM
-            auto b_lds_gemm_window = make_tile_window(
-                b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+            auto b_windows           = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+            auto& b_copy_dram_window = b_windows.at(I0{});
+            auto& b_copy_lds_window  = b_windows.at(I1{});
+            auto& b_lds_gemm_window  = b_windows.at(I2{});
 
             // Block GEMM
             auto block_gemm   = BlockGemm();
@@ -266,20 +215,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // prefetch
             // global read 0
-            GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
-            GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
+            Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
+            Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
             // LDS write 0
-            LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
-            LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+            Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+            Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
 
             // Global prefetch [1, PrefetchStages]
             static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
-                GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
-                GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
+                Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
+                Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
             });
 
             // main body
@@ -295,19 +244,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
                         block_sync_lds();
 
-                        LocalPrefill(
+                        Base::LocalPrefill(
                             a_copy_lds_window,
                             a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
                             a_element_func);
-                        LocalPrefill(
+                        Base::LocalPrefill(
                             b_copy_lds_window,
                             b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
                             b_element_func);
 
-                        GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                       a_copy_dram_window);
-                        GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                       b_copy_dram_window);
+                        Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
+                                             a_copy_dram_window);
+                        Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
+                                             b_copy_dram_window);
                     });
 
                     i += PrefetchStages;
@@ -323,12 +272,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
                     block_sync_lds();
 
-                    LocalPrefill(a_copy_lds_window,
-                                 a_block_tiles.get(number<prefetch_idx>{}),
-                                 a_element_func);
-                    LocalPrefill(b_copy_lds_window,
-                                 b_block_tiles.get(number<prefetch_idx>{}),
-                                 b_element_func);
+                    Base::LocalPrefill(a_copy_lds_window,
+                                       a_block_tiles.get(number<prefetch_idx>{}),
+                                       a_element_func);
+                    Base::LocalPrefill(b_copy_lds_window,
+                                       b_block_tiles.get(number<prefetch_idx>{}),
+                                       b_element_func);
                 });
 
                 block_sync_lds();
@@ -376,24 +325,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     };
 
     template <>
-    struct PipelineImpl<GemmPipelineScheduler::Interwave>
+    struct PipelineImpl<GemmPipelineScheduler::Interwave> : public PipelineImplBase
     {
-        template <typename DstBlockTile, typename SrcTileWindow>
-        CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
-                                           SrcTileWindow& dram_tile_window) const
-        {
-            load_tile(dst_block_tile, dram_tile_window);
-            move_tile_window(dram_tile_window, {0, KPerBlock});
-        }
-
-        template <typename DstTileWindow, typename SrcBlockTile, typename ElementFunction>
-        CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
-                                         const SrcBlockTile& src_block_tile,
-                                         const ElementFunction& element_func) const
-        {
-            const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
-            store_tile(lds_tile_window, block_tile_tmp);
-        }
+        using Base = PipelineImplBase;
 
         template <bool HasHotLoop,
                   TailNumber TailNum,
@@ -415,66 +349,38 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                 "A/B Dram block window should have the same data type as appropriate "
                 "([A|B]DataType) defined in Problem definition!");
 
-            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                              NPerBlock ==
-                                  BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+            static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                              KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}],
                           "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock"
                           " or KPerBlock!");
 
             // ------------------------------------------------------------------------------------
             // Definitions of all needed tiles
 
-            // A tile in LDS
-            ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
-            constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
-            auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
-
-            // TODO: LDS alignment should come from Policy!
-            constexpr index_t a_lds_block_space_size_aligned =
-                integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(),
-                                    16) *
-                16;
-
-            // B tile in LDS
-            BDataType* p_b_lds = static_cast<BDataType*>(
-                static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
-            constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
-            auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+            // A/B tiles in LDS
+            // With c++20 could simplify to below line.
+            // Currently get error: captured structured bindings are a C++20 extension
+            // auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem);
+            auto ab_lds_blocks = Base::GetABLdsTensorViews(p_smem);
+            auto& a_lds_block  = ab_lds_blocks.at(I0{});
+            auto& b_lds_block  = ab_lds_blocks.at(I1{});
 
             // A DRAM tile window for load
-            auto a_copy_dram_window =
-                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                 a_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeADramTileDistribution<Problem>());
-
             // A LDS tile window for store
-            auto a_copy_lds_window =
-                make_tile_window(a_lds_block,
-                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                 {0, 0},
-                                 a_copy_dram_window.get_tile_distribution());
-            // B DRAM tile window for load
-            auto b_copy_dram_window =
-                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                 b_dram_block_window_tmp.get_window_origin(),
-                                 Policy::template MakeBDramTileDistribution<Problem>());
+            // A LDS tile for block GEMM
+            auto a_windows           = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+            auto& a_copy_dram_window = a_windows.at(I0{});
+            auto& a_copy_lds_window  = a_windows.at(I1{});
+            auto& a_lds_gemm_window  = a_windows.at(I2{});
 
+            // B DRAM tile window for load
             // B LDS tile window for store
-            auto b_copy_lds_window =
-                make_tile_window(b_lds_block,
-                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                 {0, 0},
-                                 b_copy_dram_window.get_tile_distribution());
-
-            // A LDS tile for block GEMM
-            auto a_lds_gemm_window = make_tile_window(
-                a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
             // B LDS tile for block GEMM
-            auto b_lds_gemm_window = make_tile_window(
-                b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+            auto b_windows           = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+            auto& b_copy_dram_window = b_windows.at(I0{});
+            auto& b_copy_lds_window  = b_windows.at(I1{});
+            auto& b_lds_gemm_window  = b_windows.at(I2{});
 
             // Block GEMM
             auto block_gemm   = BlockGemm();
@@ -496,20 +402,20 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // prefetch
             // global read 0
-            GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
-            GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
+            Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window);
+            Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window);
 
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
             // LDS write 0
-            LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
-            LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
+            Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func);
+            Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func);
 
             // Global prefetch [1, PrefetchStages]
             static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) {
-                GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
-                GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
+                Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}), a_copy_dram_window);
+                Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}), b_copy_dram_window);
             });
 
             // main body
@@ -523,19 +429,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                         block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
                         // no second block_sync_lds because it's interwave
 
-                        LocalPrefill(
+                        Base::LocalPrefill(
                             a_copy_lds_window,
                             a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
                             a_element_func);
-                        LocalPrefill(
+                        Base::LocalPrefill(
                             b_copy_lds_window,
                             b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}),
                             b_element_func);
 
-                        GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
-                                       a_copy_dram_window);
-                        GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
-                                       b_copy_dram_window);
+                        Base::GlobalPrefetch(a_block_tiles.get(number<prefetch_idx>{}),
+                                             a_copy_dram_window);
+                        Base::GlobalPrefetch(b_block_tiles.get(number<prefetch_idx>{}),
+                                             b_copy_dram_window);
                     });
 
                     i += PrefetchStages;
@@ -548,12 +454,12 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
                     block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
                     // no second block_sync_lds because it's interwave
 
-                    LocalPrefill(a_copy_lds_window,
-                                 a_block_tiles.get(number<prefetch_idx>{}),
-                                 a_element_func);
-                    LocalPrefill(b_copy_lds_window,
-                                 b_block_tiles.get(number<prefetch_idx>{}),
-                                 b_element_func);
+                    Base::LocalPrefill(a_copy_lds_window,
+                                       a_block_tiles.get(number<prefetch_idx>{}),
+                                       a_element_func);
+                    Base::LocalPrefill(b_copy_lds_window,
+                                       b_block_tiles.get(number<prefetch_idx>{}),
+                                       b_element_func);
                 });
 
                 block_sync_lds();
-- 
GitLab


From aa6e2087f550be335e7b14893ee615303eec3faa Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 28 Nov 2024 10:42:19 -0800
Subject: [PATCH 089/153] Reduce docker size and build time in CI. (#1699)

* refactor docker build in CI

* add Dockerfile.compiler

* add input args to Dockerfile.compiler

* rearrange the docker args
---
 Dockerfile          |  4 ----
 Dockerfile.compiler | 26 ++++++++++++++++++++++++++
 Jenkinsfile         | 45 +++++++++++++++++++++++++--------------------
 3 files changed, 51 insertions(+), 24 deletions(-)
 create mode 100644 Dockerfile.compiler

diff --git a/Dockerfile b/Dockerfile
index 38a563ce3..f9b7d76e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -77,10 +77,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
 # Remove unnecessary rocm components that take a lot of space
     apt-get remove -y rocblas rocfft rocsparse composablekernel-dev
 
-# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
-RUN if [ "$ROCMVERSION" = "6.1" ]; then \
-        sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
-    fi
 # Update the cmake to version 3.27.5
 RUN pip install --upgrade cmake==3.27.5 && \
 #Install latest ccache
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
new file mode 100644
index 000000000..354b71f69
--- /dev/null
+++ b/Dockerfile.compiler
@@ -0,0 +1,26 @@
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2"
+FROM $BASE_DOCKER
+ARG compiler_version=""
+ARG compiler_commit=""
+
+# Add alternative compilers, if necessary
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+    sh -c "echo compiler commit = '$compiler_commit'"
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
+        git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+        make -j 16 ; \
+    else echo "using the release compiler"; \
+    fi
diff --git a/Jenkinsfile b/Jenkinsfile
index b448a5130..f8493fa2f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -32,41 +32,42 @@ def runShell(String command){
     return (output != "")
 }
 
-def getDockerImageName(){
+def getBaseDockerImageName(){
     def img
     if (params.USE_CUSTOM_DOCKER != ""){
         img = "${params.USE_CUSTOM_DOCKER}"
     }
     else{
     if (params.ROCMVERSION != "6.3"){
-       if (params.COMPILER_VERSION == "") {
-           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
-       }
-       else{
-          if (params.COMPILER_COMMIT == ""){
-             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
-          }
-          else{
-             def commit = "${params.COMPILER_COMMIT}"[0..6]
-             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
-          }
-       }
+        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+    else{
+        img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+    }
+    return img
+}
+
+def getDockerImageName(){
+    def img
+    def base_name = getBaseDockerImageName()
+    if (params.USE_CUSTOM_DOCKER != ""){
+        img = "${params.USE_CUSTOM_DOCKER}"
     }
     else{
        if (params.COMPILER_VERSION == "") {
-           img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+           img = "${base_name}"
        }
        else{
           if (params.COMPILER_COMMIT == ""){
-             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+             img = "${base_name}_${params.COMPILER_VERSION}"
           }
           else{
              def commit = "${params.COMPILER_COMMIT}"[0..6]
-             img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+             img = "${base_name}_${params.COMPILER_VERSION}_${commit}"
           }
        }
     }
-    }
     return img
 }
 
@@ -131,17 +132,21 @@ def buildDocker(install_prefix){
     env.DOCKER_BUILDKIT=1
     checkout scm
     def image_name = getDockerImageName()
+    def base_image_name = getBaseDockerImageName()
     echo "Building Docker for ${image_name}"
-    def dockerArgs = "--squash --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
+    def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
     if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
-        dockerArgs = dockerArgs + " --no-cache "
+        dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
+    }
+    else{
+        dockerArgs = dockerArgs + " -f Dockerfile . "
     }
     echo "Build Args: ${dockerArgs}"
     try{
         if(params.BUILD_DOCKER){
             //force building the new docker if that parameter is true
             echo "Building image: ${image_name}"
-            retimage = docker.build("${image_name}", dockerArgs + ' .')
+            retimage = docker.build("${image_name}", dockerArgs)
             withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) {
                 retimage.push()
             }
-- 
GitLab


From bb652696e765fe178404bd38a071d6d6b829bccb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 10:43:36 -0800
Subject: [PATCH 090/153] Bump rocm-docs-core from 1.9.0 to 1.9.1 in
 /docs/sphinx (#1701)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.0 to 1.9.1.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.0...v1.9.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 5bec504a0..79c74cd7f 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.9.0
+rocm-docs-core==1.9.1
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 8881c0e74..426073037 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.9.0
+rocm-docs-core==1.9.1
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 78f0fea08eafa7e3da49cbb3d77c962cecb3ae0b Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Fri, 29 Nov 2024 11:52:18 +0100
Subject: [PATCH 091/153] Ck tile batched gemm example (#1615)

* [CK Tile] Batched GEMM Example

* [CK Tile] Batched GEMM Example - minor refactor

* [CK Tile] Batched GEMM Example - README update

* [CK Tile] Batched Gemm Example - review changes

- Added tensor data layours as input parameters
- Changed structure of Host and Kernel args
- Removed bug with invalid vector read on non-contiguous memory

* [CK Tile] Batched Gemm Example - remove comment

* [CK Tile] Batched Gemm Example - Add GTests part1

* [CK Tile] Batched Gemm Example - GTests part2 + review changes

* [CK TILE] Batched GEMM post merge fixes

* [CK Tile] Batched GEMM Example - fix pad views
---
 .../ck_tile/16_batched_gemm/CMakeLists.txt    |   1 +
 example/ck_tile/16_batched_gemm/README.md     |  37 +++
 .../ck_tile/16_batched_gemm/batched_gemm.cpp  | 103 +++++++
 .../ck_tile/16_batched_gemm/batched_gemm.hpp  |  63 +++++
 .../run_batched_gemm_example.inc              | 253 +++++++++++++++++
 example/ck_tile/CMakeLists.txt                |   2 +-
 .../ck_tile/host/reference/reference_gemm.hpp | 112 ++++++++
 include/ck_tile/ops/gemm.hpp                  |   1 +
 .../ops/gemm/kernel/batched_gemm_kernel.hpp   | 258 ++++++++++++++++++
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |   2 +-
 test/ck_tile/CMakeLists.txt                   |   1 +
 test/ck_tile/batched_gemm/CMakeLists.txt      |   4 +
 .../batched_gemm/test_batched_gemm.cpp        |  29 ++
 .../test_batched_gemm_ut_cases.inc            |   9 +
 .../batched_gemm/test_batched_gemm_util.hpp   | 225 +++++++++++++++
 15 files changed, 1098 insertions(+), 2 deletions(-)
 create mode 100644 example/ck_tile/16_batched_gemm/CMakeLists.txt
 create mode 100644 example/ck_tile/16_batched_gemm/README.md
 create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.cpp
 create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.hpp
 create mode 100644 example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
 create mode 100644 include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
 create mode 100644 test/ck_tile/batched_gemm/CMakeLists.txt
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm.cpp
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_util.hpp

diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt
new file mode 100644
index 000000000..78e78c6b0
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp)
diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md
new file mode 100644
index 000000000..34b56db52
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/README.md
@@ -0,0 +1,37 @@
+# Batched GEMM
+
+This folder contains example for batched GEMM using ck_tile tile-programming implementation.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+make tile_example_batched_gemm -j
+```
+This will result in an executable `build/bin/tile_example_batched_gemm`
+
+## example
+```
+args:
+              -m     m dimension (default:256)
+              -n     n dimension (default:128)
+              -k     k dimension (default:128)
+       -a_layout     A tensor data layout (default:R) (R for Row, C for Col)
+       -b_layout     B tensor data layout (default:R) (R for Row, C for Col)
+       -c_layout     C tensor data layout (default:R) (R for Row, C for Col)
+       -stride_a     Tensor A stride (default:128)
+       -stride_b     Tensor B stride (default:128)
+       -stride_c     Tensor C stride (default:128)
+ -batch_stride_a     Batch A stride (default:32768)
+ -batch_stride_b     Batch B stride (default:16384)
+ -batch_stride_c     Batch C stride (default:32768)
+    -batch_count     Batch count (default:16)
+              -v     0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+              -e     Absolute error tolerance (default:1e-5)
+           -prec     data type. fp16/bf16/fp8/bf8 (default:fp16)
+         -warmup     number of iterations before benchmark the kernel (default:10)
+         -repeat     number of iterations to benchmark the kernel (default:100)
+          -timer     gpu:gpu timer, cpu:cpu timer (default:gpu)
+```
\ No newline at end of file
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
new file mode 100644
index 000000000..bfdd74126
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "batched_gemm.hpp"
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
+{
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM        = false;
+    constexpr bool kPadN        = false;
+    constexpr bool kPadK        = false;
+    constexpr bool kTilePermute = false;
+    // The rank and permutation will also be generate out by the CodeGen part.
+    constexpr ck_tile::index_t kOutputRank = 2;
+
+    constexpr int kBlockPerCu = 1;
+
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    // Whether doing the CShuffle (transpose before the global memory), depending on the output
+    // layout.
+    constexpr bool CShuffleEpilogue =
+        std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTilePartitioner<CodegenGemmShape>;
+
+    using GemmEpilogue = std::conditional_t<
+        CShuffleEpilogue,
+        ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                                                   CDataType,
+                                                                   kPadM,
+                                                                   kPadN,
+                                                                   kTilePermute,
+                                                                   kOutputRank,
+                                                                   1,
+                                                                   0,
+                                                                   TilePartitioner::kM,
+                                                                   TilePartitioner::kN>>,
+        ck_tile::Default2DEpilogue<
+            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+
+    using CodegenPipelineProblem = ck_tile::
+        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
+
+    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+    using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+
+    auto kargs = Kernel::MakeKargs(args);
+
+    const dim3 grids      = Kernel::GridSize(args);
+    constexpr dim3 blocks = Kernel::BlockSize();
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Launching kernel with args:"
+                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+    }
+
+    float ave_time = ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    return ave_time;
+}
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
new file mode 100644
index 000000000..e252c0f67
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
+
+template <typename DataType>
+struct BatchedGemmTypeConfig;
+
+template <>
+struct BatchedGemmTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+using Types = BatchedGemmTypeConfig<ck_tile::half_t>;
+
+// Specific type aliases for easy access
+using ADataType   = Types::ADataType;
+using BDataType   = Types::BDataType;
+using AccDataType = Types::AccDataType;
+using CDataType   = Types::CDataType;
+
+struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs
+{
+};
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "256", "m dimension")
+        .insert("n", "128", "n dimension")
+        .insert("k", "128", "k dimension")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "R", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("batch_stride_a", "32768", "Batch A stride")
+        .insert("batch_stride_b", "16384", "Batch B stride")
+        .insert("batch_stride_c", "32768", "Batch C stride")
+        .insert("batch_count", "16", "Batch count")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
new file mode 100644
index 000000000..dacca2042
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                          ck_tile::DeviceMem& b_k_n_dev_buf,
+                          ck_tile::DeviceMem& c_m_n_dev_buf,
+                          ck_tile::index_t M,
+                          ck_tile::index_t N,
+                          ck_tile::index_t K,
+                          ck_tile::index_t stride_A,
+                          ck_tile::index_t stride_B,
+                          ck_tile::index_t stride_C,
+                          ck_tile::index_t batch_stride_A,
+                          ck_tile::index_t batch_stride_B,
+                          ck_tile::index_t batch_stride_C,
+                          ck_tile::index_t batch_count,
+                          int n_warmup,
+                          int n_repeat)
+{
+    batched_gemm_kargs args;
+    args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
+    args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
+    args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
+    args.M              = M;
+    args.N              = N;
+    args.K              = K;
+    args.stride_A       = stride_A;
+    args.stride_B       = stride_B;
+    args.stride_C       = stride_C;
+    args.batch_stride_A = batch_stride_A;
+    args.batch_stride_B = batch_stride_B;
+    args.batch_stride_C = batch_stride_C;
+    args.batch_count    = batch_count;
+
+    float ave_time = batched_gemm<ALayout, BLayout, CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::string op_name{"Batched Gemm"};
+    std::size_t flop     = std::size_t(2) * batch_count * M * N * K;
+    std::size_t num_byte = sizeof(ADataType) * batch_count * M * K +
+                           sizeof(BDataType) * batch_count * N * K +
+                           sizeof(CDataType) * batch_count * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run " << op_name << "kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B
+              << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : "
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_batched_gemm_example_with_layouts(int argc,
+                                          char* argv[],
+                                          const ALayout a_layout                  = ALayout{},
+                                          const BLayout b_layout                  = BLayout{},
+                                          [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t batch_stride_A = arg_parser.get_int("batch_stride_a");
+    ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b");
+    ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c");
+    ck_tile::index_t batch_count    = arg_parser.get_int("batch_count");
+
+    int n_warmup = arg_parser.get_int("warmup");
+    int n_repeat = arg_parser.get_int("repeat");
+
+    using namespace ck_tile::literals;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                 {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                 {batch_stride, 1_uz, stride});
+        }
+    };
+
+    auto f_get_default_stride = [](std::size_t row,
+                                   std::size_t col,
+                                   std::size_t stride,
+                                   auto layout) {
+        if(stride == 0)
+        {
+            // give a chance if stride is zero, return a default packed stride
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return col;
+            }
+            else
+            {
+                return row;
+            }
+        }
+        else
+            return stride;
+    };
+
+    stride_A = f_get_default_stride(M, K, stride_A, a_layout);
+    stride_B = f_get_default_stride(K, N, stride_B, b_layout);
+    stride_C = f_get_default_stride(M, N, stride_C, c_layout);
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, a_layout));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, b_layout));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, c_layout));
+
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    invoke_batched_gemm<ALayout, BLayout, CLayout>(a_m_k_dev_buf,
+                                                   b_k_n_dev_buf,
+                                                   c_m_n_dev_buf,
+                                                   M,
+                                                   N,
+                                                   K,
+                                                   stride_A,
+                                                   stride_B,
+                                                   stride_C,
+                                                   batch_stride_A,
+                                                   batch_stride_B,
+                                                   batch_stride_C,
+                                                   batch_count,
+                                                   n_warmup,
+                                                   n_repeat);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        const auto b_n_k = b_k_n.transpose({0, 2, 1});
+
+        ck_tile::reference_batched_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_n_k, c_m_n_host_ref);
+
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ck_tile::reference_batched_gemm_gpu<ADataType,
+                                            BDataType,
+                                            AccDataType,
+                                            CDataType,
+                                            ALayout,
+                                            BLayout,
+                                            CLayout>(a_m_k_dev_buf,
+                                                     b_k_n_dev_buf,
+                                                     c_m_n_gpu_buf_ref,
+                                                     M,
+                                                     N,
+                                                     K,
+                                                     stride_A,
+                                                     stride_B,
+                                                     stride_C,
+                                                     batch_stride_A,
+                                                     batch_stride_B,
+                                                     batch_stride_C,
+                                                     batch_count);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref);
+
+        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+int run_batched_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    std::string a_layout = arg_parser.get_str("a_layout");
+    std::string b_layout = arg_parser.get_str("b_layout");
+
+    if(a_layout == "R" && b_layout == "R")
+    {
+        return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "C")
+    {
+        return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+    }
+    // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not
+    // work else if(a_layout == "C" && b_layout == "C")
+    // {
+    //     return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
+    // }
+    // else if(a_layout == "C" && b_layout == "R")
+    // {
+    //     return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    // }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 29305405b..51ebb5bf0 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -15,4 +15,4 @@ add_subdirectory(12_smoothquant)
 add_subdirectory(13_moe_sorting)
 add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
-
+add_subdirectory(16_batched_gemm)
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index dbdef0e9c..8bd1f5b04 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -183,4 +183,116 @@ void reference_gemm_gpu(DeviceMem& a_device,
 
     return;
 }
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC>
+void reference_batched_gemm_gpu(DeviceMem& a_device,
+                                DeviceMem& b_device,
+                                DeviceMem& c_device,
+                                index_t M,
+                                index_t N,
+                                index_t K,
+                                index_t stride_a,
+                                index_t stride_b,
+                                index_t stride_c,
+                                index_t batch_stride_A,
+                                index_t batch_stride_B,
+                                index_t batch_stride_C,
+                                index_t batch_count)
+{
+
+    ADataType* d_A;
+    BDataType* d_B;
+    CDataType* d_C;
+
+    hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType));
+    hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType));
+    hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType));
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
+                  << std::endl;
+        return; // Early exit on error
+    }
+
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
+                  << std::endl;
+        return; // Early exit on error
+    }
+
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
+                  << std::endl;
+        return; // Early exit on error
+    }
+
+    errA = hipMemcpy(d_A,
+                     a_device.GetDeviceBuffer(),
+                     batch_count * M * K * sizeof(ADataType),
+                     hipMemcpyHostToDevice);
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
+    }
+
+    errB = hipMemcpy(d_B,
+                     b_device.GetDeviceBuffer(),
+                     batch_count * N * K * sizeof(BDataType),
+                     hipMemcpyHostToDevice);
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
+    }
+
+    int totalElements      = M * N;
+    int numThreadsPerBlock = 256; // Common choice for threads per block
+    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
+
+    for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
+    {
+        ADataType* d_ATemp = d_A + batch_id * batch_stride_A;
+        BDataType* d_BTemp = d_B + batch_id * batch_stride_B;
+        CDataType* d_CTemp = d_C + batch_id * batch_stride_C;
+        naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
+            <<<numBlocks, numThreadsPerBlock>>>(
+                d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
+    }
+
+    errC = hipMemcpy(c_device.GetDeviceBuffer(),
+                     d_C,
+                     batch_count * M * N * sizeof(CDataType),
+                     hipMemcpyDeviceToHost);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
+    }
+
+    errA = hipFree(d_A);
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
+    }
+
+    errB = hipFree(d_B);
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
+    }
+
+    errC = hipFree(d_C);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
+    }
+
+    return;
+}
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 1340fb204..b9eb24858 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -25,6 +25,7 @@
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
new file mode 100644
index 000000000..07b4af573
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+
+namespace ck_tile {
+
+struct BatchedGemmHostArgs
+{
+    const void* a_ptr;
+    const void* b_ptr;
+    void* c_ptr;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+    index_t batch_stride_A;
+    index_t batch_stride_B;
+    index_t batch_stride_C;
+    index_t batch_count;
+};
+
+template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
+struct BatchedGemmKernel
+{
+    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+
+    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    struct BatchedGemmKargs
+    {
+        const void* a_ptr;
+        const void* b_ptr;
+        void* c_ptr;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t stride_A;
+        index_t stride_B;
+        index_t stride_C;
+        index_t batch_stride_A;
+        index_t batch_stride_B;
+        index_t batch_stride_C;
+        index_t batch_count;
+    };
+
+    using Kargs = BatchedGemmKargs;
+    using Hargs = BatchedGemmHostArgs;
+
+    __host__ static constexpr auto GridSize(const Hargs& h)
+    {
+        return TilePartitioner::GridSize(h.M, h.N, h.batch_count);
+    }
+
+    __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+
+    CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.a_ptr          = h.a_ptr;
+        k.b_ptr          = h.b_ptr;
+        k.c_ptr          = h.c_ptr;
+        k.M              = h.M;
+        k.N              = h.N;
+        k.K              = h.K;
+        k.stride_A       = h.stride_A;
+        k.stride_B       = h.stride_B;
+        k.stride_C       = h.stride_C;
+        k.batch_stride_A = h.batch_stride_A;
+        k.batch_stride_B = h.batch_stride_B;
+        k.batch_stride_C = h.batch_stride_C;
+        k.batch_count    = h.batch_count;
+        return k;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        const auto [i_m, i_n] = TilePartitioner{}();
+        const auto i_batch    = __builtin_amdgcn_readfirstlane(blockIdx.z);
+
+        //  options
+        const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
+        const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
+        const ADataType* a_start  = static_cast<const ADataType*>(kargs.a_ptr);
+
+        const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
+        const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
+        const BDataType* b_start  = static_cast<const BDataType*>(kargs.b_ptr);
+
+        // Convert pointers to tensor views
+        auto a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start + batch_offset_A,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::VectorSizeA>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start + batch_offset_A,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(1, kargs.stride_A),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        auto b_tensor_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start + batch_offset_B,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(1, kargs.stride_B),
+                    number<1>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start + batch_offset_B,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(kargs.stride_B, 1),
+                    number<GemmPipeline::VectorSizeB>{},
+                    number<1>{});
+            }
+        }();
+
+        auto a_pad_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    a_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        // clang-format on
+
+        auto a_block_window = make_tile_window(
+            a_pad_view,
+            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+            {i_m, 0});
+
+        auto b_pad_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    b_tensor_view,
+                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+                    sequence<GemmPipeline::kPadN, false>{});
+            }
+        }();
+        // clang-format on
+
+        auto b_block_window = make_tile_window(
+            b_pad_view,
+            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+            {i_n, 0});
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
+
+        // Run GEMM cooperatively by whole wokrgroup.
+        auto c_block_tile =
+            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
+
+        const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C);
+        const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C);
+        CDataType* c_start        = static_cast<CDataType*>(kargs.c_ptr);
+        auto c_tensor_view        = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start + batch_offset_C,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<GemmPipeline::VectorSizeC>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start + batch_offset_C,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        auto c_pad_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<false, GemmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    c_tensor_view,
+                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+                    sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        auto c_block_window = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
+            {i_m, i_n});
+
+        EpiloguePipeline{}(c_block_window, c_block_tile);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index c0817e736..822748c69 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -124,7 +124,7 @@ struct GemmPipelineAGmemBGmemCRegV1
             b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
 
         // Block GEMM
-        constexpr auto block_gemm = Policy::template GetBlockGemm<Problem>();
+        auto block_gemm = Policy::template GetBlockGemm<Problem>();
 
         // Acc register tile
         auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt
index ac9c4311d..fd0de0f9c 100644
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(image_to_column)
 add_subdirectory(gemm)
+add_subdirectory(batched_gemm)
diff --git a/test/ck_tile/batched_gemm/CMakeLists.txt b/test/ck_tile/batched_gemm/CMakeLists.txt
new file mode 100644
index 000000000..532ead112
--- /dev/null
+++ b/test/ck_tile/batched_gemm/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Currently ck_tile is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp)
+endif()
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm.cpp b/test/ck_tile/batched_gemm/test_batched_gemm.cpp
new file mode 100644
index 000000000..29bed8d2f
--- /dev/null
+++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_batched_gemm_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F32 = float;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
+    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>//,
+    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileBatchedGemm, KernelTypes);
+
+#include "test_batched_gemm_ut_cases.inc"
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
new file mode 100644
index 000000000..f261164d6
--- /dev/null
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
@@ -0,0 +1,9 @@
+#pragma once
+
+TYPED_TEST(TestCkTileBatchedGemm, Basic)
+{
+    constexpr int M = 256;
+    constexpr int N = 128;
+    constexpr int K = 128;
+    this->Run(M, N, K);
+}
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
new file mode 100644
index 000000000..88145b987
--- /dev/null
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <sstream>
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
+
+template <typename Tuple>
+class TestCkTileBatchedGemm : public ::testing::Test
+{
+    protected:
+    using ALayout     = std::tuple_element_t<0, Tuple>;
+    using BLayout     = std::tuple_element_t<1, Tuple>;
+    using CLayout     = std::tuple_element_t<2, Tuple>;
+    using ADataType   = std::tuple_element_t<3, Tuple>;
+    using BDataType   = std::tuple_element_t<4, Tuple>;
+    using AccDataType = std::tuple_element_t<5, Tuple>;
+    using CDataType   = std::tuple_element_t<6, Tuple>;
+
+    struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs
+    {
+    };
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
+    {
+        // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+        constexpr bool kPadM        = false;
+        constexpr bool kPadN        = false;
+        constexpr bool kPadK        = false;
+        constexpr bool kTilePermute = false;
+        // The rank and permutation will also be generate out by the CodeGen part.
+        constexpr ck_tile::index_t kOutputRank = 2;
+
+        constexpr int kBlockPerCu = 1;
+
+        // This part comes from the Codegen
+        constexpr ck_tile::index_t M_Tile = 128;
+        constexpr ck_tile::index_t N_Tile = 128;
+        constexpr ck_tile::index_t K_Tile = 32;
+
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+        // Whether doing the CShuffle (transpose before the global memory), depending on the output
+        // layout.
+        constexpr bool CShuffleEpilogue =
+            std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+        using CodegenGemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        using TilePartitioner = ck_tile::GemmTilePartitioner<CodegenGemmShape>;
+
+        using GemmEpilogue = std::conditional_t<
+            CShuffleEpilogue,
+            ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                                                       CDataType,
+                                                                       kPadM,
+                                                                       kPadN,
+                                                                       kTilePermute,
+                                                                       kOutputRank,
+                                                                       1,
+                                                                       0,
+                                                                       TilePartitioner::kM,
+                                                                       TilePartitioner::kN>>,
+            ck_tile::Default2DEpilogue<
+                ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
+
+        using CodegenGemmTraits =
+            ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
+                                                                    BDataType,
+                                                                    AccDataType,
+                                                                    CodegenGemmShape,
+                                                                    CodegenGemmTraits>;
+
+        using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+        using Kernel =
+            ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKargs(args);
+
+        const dim3 grids      = Kernel::GridSize(args);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args:"
+                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    }
+
+    public:
+    void Run(const int M,
+             const int N,
+             const int K,
+             int StrideA            = 128,
+             int StrideB            = 128,
+             int StrideC            = 128,
+             const int BatchStrideA = 32768,
+             const int BatchStrideB = 16384,
+             const int BatchStrideC = 32768,
+             const int BatchCount   = 16)
+    {
+        using namespace ck_tile::literals;
+
+        auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                           std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           std::size_t batch_stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                     {batch_stride, stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                     {batch_stride, 1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    // give a chance if stride is zero, return a default packed stride
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+        StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+        StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+        ck_tile::HostTensor<ADataType> a_m_k(
+            f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{}));
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+            f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+        a_m_k_dev_buf.ToDevice(a_m_k.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+
+        batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(),
+                                 b_k_n_dev_buf.GetDeviceBuffer(),
+                                 c_m_n_dev_buf.GetDeviceBuffer(),
+                                 M,
+                                 N,
+                                 K,
+                                 StrideA,
+                                 StrideB,
+                                 StrideC,
+                                 BatchStrideA,
+                                 BatchStrideB,
+                                 BatchStrideC,
+                                 BatchCount};
+
+        invoke_batched_gemm<ALayout, BLayout, CLayout>(kargs,
+                                                       ck_tile::stream_config{nullptr, false});
+
+        std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
+                  << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideC =" << StrideC
+                  << " BatchStrideA =" << BatchStrideA << " BatchStrideB =" << BatchStrideB
+                  << " BatchStrideC =" << BatchStrideC << " BatchCount =" << BatchCount
+                  << std::endl;
+
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+        bool pass = true;
+
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        const auto b_n_k = b_k_n.transpose({0, 2, 1});
+        ck_tile::reference_batched_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_n_k, c_m_n_host_ref);
+
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+        EXPECT_TRUE(pass);
+    }
+};
-- 
GitLab


From 28e02cf5243107a8b2ea65e0a8ef0e1c4bba3964 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 29 Nov 2024 07:18:43 -0800
Subject: [PATCH 092/153] Bump rocm-docs-core from 1.9.1 to 1.9.2 in
 /docs/sphinx (#1702)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.1 to 1.9.2.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.1...v1.9.2)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 79c74cd7f..995dfaf02 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.9.1
+rocm-docs-core==1.9.2
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 426073037..d8f7c3846 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.9.1
+rocm-docs-core==1.9.2
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From cff7fab798a867c9507fafe7beccd76afd0d16d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Sat, 30 Nov 2024 05:51:09 +0100
Subject: [PATCH 093/153] [CK TILE] Fix universal gemm template keywords
 (#1704)

---
 .../ops/gemm/block/block_universal_gemm_as_bs_cr.hpp        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index c9e648f43..0fe0a9f40 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -623,7 +623,7 @@ struct BlockUniversalGemmAsBsCr
     CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                       const BSmemBlockWindow& b_block_window)
     {
-        block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window);
+        block_gemm_impl_.LocalPrefetch(a_block_window, b_block_window);
     }
 
     // C += A * B
@@ -632,7 +632,7 @@ struct BlockUniversalGemmAsBsCr
                                    const ASmemBlockWindow& a_block_window,
                                    const BSmemBlockWindow& b_block_window)
     {
-        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+        block_gemm_impl_(c_block_tensor, a_block_window, b_block_window);
     }
 
     // C = A * B
@@ -641,7 +641,7 @@ struct BlockUniversalGemmAsBsCr
                                    const BSmemBlockWindow& b_block_window)
     {
         auto c_block_tensor = MakeCBlockTile();
-        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+        block_gemm_impl_(c_block_tensor, a_block_window, b_block_window);
         return c_block_tensor;
     }
 
-- 
GitLab


From 44828b7c0f0d2d4cba5b40c8f2706f542a436aa9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 30 Nov 2024 08:11:42 -0800
Subject: [PATCH 094/153] [Python] Add batched gemm instances parsing (#1684)

* add op

* do not insert ds parameters as they are already parsed

* reset ds parameters

* apply ruff
---
 .../batched_universal_gemm/gen_instances.py   | 149 ++++++++++++++++++
 .../ck4inductor/batched_universal_gemm/op.py  |  99 ++++++++++++
 .../grouped_conv_fwd/gen_instances.py         |   4 +-
 3 files changed, 249 insertions(+), 3 deletions(-)
 create mode 100644 python/ck4inductor/batched_universal_gemm/gen_instances.py
 create mode 100644 python/ck4inductor/batched_universal_gemm/op.py

diff --git a/python/ck4inductor/batched_universal_gemm/gen_instances.py b/python/ck4inductor/batched_universal_gemm/gen_instances.py
new file mode 100644
index 000000000..8879fb93d
--- /dev/null
+++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+import logging
+import os
+import subprocess
+from dataclasses import replace
+from functools import lru_cache
+from typing import List
+
+from ..util import library_path
+
+from .op import CKBatchedGemmOperation
+
+log = logging.getLogger(__name__)
+
+
+def _ck_library_dir():
+    gemm_instances_path = os.path.join(
+        library_path(),
+        "src",
+        "tensor_operation_instance",
+        "gpu",
+        "gemm_universal_batched",
+    )
+    if not os.path.exists(gemm_instances_path):
+        log.error("CK library path %s does not exist", gemm_instances_path)
+        return None
+    return gemm_instances_path
+
+
+def parse_instances(str_instances: List[str]) -> List[CKBatchedGemmOperation]:
+    """
+    Parse the lines containing Universal Gemm template instances into `CKBatchedGemmOperation` instances
+    """
+
+    def maybe_int(s):
+        try:
+            return int(s)
+        except ValueError:
+            return s
+
+    op_instances = []
+    for line in str_instances:
+        s_template_args = line.split("DeviceBatchedGemmMultiD_Xdl_CShuffle_V3")[
+            -1
+        ].strip("<>, ")
+        template_args = []
+        i_current = 0
+        while i_current < len(s_template_args):
+            if s_template_args[i_current] == " ":
+                # skip whitespace
+                i_current += 1
+                continue
+            elif s_template_args[i_current : i_current + 2] == "S<":
+                # parse template S<Index...>
+                i_next = s_template_args.find(">", i_current)
+                template_args.append(
+                    tuple(map(int, s_template_args[i_current + 2 : i_next].split(",")))
+                )
+                i_current = i_next + 2
+            else:
+                # all string attributes must be either type aliases or global constants in C++
+                i_next = s_template_args.find(",", i_current)
+                template_args.append(
+                    maybe_int(
+                        s_template_args[i_current : i_next if i_next != -1 else None]
+                    )
+                )
+                if i_next != -1:
+                    i_current = i_next + 1
+            if i_next == -1:
+                break
+
+        # ds layout and dtype are parsed as placeholder; reset value
+        template_args[2] = tuple()  # ds layout
+        template_args[6] = tuple()  # ds dtype
+
+        new_instance = CKBatchedGemmOperation(
+            *template_args,  # type: ignore[arg-type]
+        )
+
+        op_instances.append(new_instance)
+    return op_instances
+
+
+@lru_cache(None)
+def gen_ops_library() -> List[CKBatchedGemmOperation]:
+    """
+    Parse the Universal Gemm instances defined in the composable kernel library folder.
+    """
+    ck_library_dir = _ck_library_dir()
+    if not ck_library_dir:
+        return []
+
+    grep_result = subprocess.run(
+        [
+            "grep",
+            "-inR",
+            "DeviceBatchedGemmMultiD_Xdl_CShuffle_V3",
+            _ck_library_dir(),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    op_instances = parse_instances(grep_result.stdout.strip().split("\n"))
+
+    log.debug("ck instances from library: %d", len(op_instances))
+
+    schedulers = [
+        "BlockGemmPipelineScheduler::Intrawave",
+        "BlockGemmPipelineScheduler::Interwave",
+    ]
+    gemm_specs = [
+        "GemmSpecialization::Default",
+        "GemmSpecialization::MPadding",
+        "GemmSpecialization::NPadding",
+        "GemmSpecialization::KPadding",
+        "GemmSpecialization::MNPadding",
+        "GemmSpecialization::MKPadding",
+        "GemmSpecialization::NKPadding",
+        "GemmSpecialization::MNKPadding",
+    ]
+
+    # substitute templated args by looping through their domains
+    substitute_instances = []
+    for instance in op_instances:
+        sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
+        sub_spec = instance.gemm_specialization == "GemmSpec"
+        schedulers_range = (
+            schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]
+        )
+        spec_range = gemm_specs if sub_spec else [instance.gemm_specialization]
+        for scheduler in schedulers_range:
+            for spec in spec_range:
+                substitute_instances.append(
+                    replace(
+                        instance,
+                        block_gemm_pipeline_scheduler=scheduler,
+                        gemm_specialization=spec,
+                    )
+                )
+
+    return substitute_instances
+
+
+if __name__ == "__main__":
+    print(gen_ops_library())
diff --git a/python/ck4inductor/batched_universal_gemm/op.py b/python/ck4inductor/batched_universal_gemm/op.py
new file mode 100644
index 000000000..96978ac8d
--- /dev/null
+++ b/python/ck4inductor/batched_universal_gemm/op.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+from dataclasses import asdict, dataclass
+from typing import Optional, Tuple
+
+
+@dataclass
+class CKBatchedGemmOperation:
+    """
+    A python dataclass storing the template parameters of a CK Universal Gemm template instance
+    """
+
+    a_layout: str
+    b_layout: str
+    ds_layouts: Tuple[str]  # addmm specific
+    c_layout: str
+
+    a_element_dtype: str
+    b_element_dtype: str
+    ds_element_dtypes: Tuple[str]  # addmm specific
+    c_element_dtype: str
+
+    acc_dtype: str
+    c_shuffle_dtype: str
+
+    a_elementwise_op: str
+    b_elementwise_op: str
+    c_elementwise_op: str
+
+    gemm_specialization: str
+
+    block_size: int
+
+    m_per_block: int
+    n_per_block: int
+    k_per_block: int
+
+    a_k1: int
+    b_k1: int
+
+    m_per_xdl: int
+    n_per_xdl: int
+
+    m_xdl_per_wave: int
+    n_xdl_per_wave: int
+
+    a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int]
+    a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    a_block_transfer_src_access_order: Tuple[int, int, int]
+    a_block_transfer_src_vector_dim: int
+    a_block_transfer_src_scalar_per_vector: int
+    a_block_transfer_dst_scalar_per_vector_ak1: int
+    a_block_lds_extra_m: bool
+
+    b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int]
+    b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    b_block_transfer_src_access_order: Tuple[int, int, int]
+
+    b_block_transfer_src_vector_dim: int
+    b_block_transfer_src_scalar_per_vector: int
+    b_block_transfer_dst_scalar_per_vector_bk1: int
+    b_block_lds_extra_n: bool
+
+    c_shuffle_m_xdl_per_wave_per_shuffle: int
+    c_shuffle_n_xdl_per_wave_per_shuffle: int
+
+    c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: (
+        Tuple[int, int, int, int]
+    )
+    c_shuffle_block_transfer_scalar_per_vector_n_per_block: Tuple[int]
+    block_gemm_pipeline_scheduler: str
+    block_gemm_pipeline_version: str
+
+    a_compute_dtype: Optional[str] = None
+    b_compute_dtype: Optional[str] = None
+
+    def name(self):
+        # cpp alias for template instance
+        return f"ck_device_batched_gemm_multi_d_xdl_c_shuffle_v3_{self.key_name()}"
+
+    def key_name(self):
+        # TBD; must be unique per instance. Intended to use as dict key
+        return "_".join(
+            [
+                "K"
+                + field_name.replace("_", "").lower()
+                + "V"
+                + (
+                    "x".join(map(str, iter(field_value)))
+                    if isinstance(field_value, tuple)
+                    else str(field_value).replace(":", "")
+                )
+                for field_name, field_value in self.dict_items()
+            ]
+        )
+
+    def dict_items(self):
+        return asdict(self).items()
diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
index ffbea6bdc..feca20a3b 100644
--- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py
+++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
@@ -130,9 +130,7 @@ def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]:
     # substitute templated args by looping through their domains
     substitute_instances = []
     for instance in op_instances:
-        sub_scheduler = (
-            instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
-        )
+        sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
         sub_spec = instance.conv_forward_specialization == "ConvSpec"
         schedulers_range = (
             schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]
-- 
GitLab


From 9488f1c981cda8515b45952a14e539621150c1f6 Mon Sep 17 00:00:00 2001
From: rtmadduri <rimaddur@amd.com>
Date: Mon, 2 Dec 2024 00:13:56 -0800
Subject: [PATCH 095/153] LWPCK-2429: Device grouped GEMM uses Async Memcpy
 (#1695)

* LWPCK-2429: Device grouped GEMM uses Async Memcpy
Resolving merge conflicts

* reverting changes to profile_grouped_gemm

* revert date change

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 .../impl/device_grouped_gemm_multiple_d_dl.hpp       | 12 ++++++------
 ...gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp | 10 +++++-----
 ...rouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp |  8 ++++----
 .../gpu/device/impl/device_grouped_gemm_xdl.hpp      | 12 ++++++------
 .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 10 +++++-----
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index 060a16d1e..959fc890b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -603,11 +603,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
             }
 
             hipGetErrorString(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_desc_kernel_arg_.data(),
-                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_desc_kernel_arg_.data(),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));
 
             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
index 0535c8032..d692aa05c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -761,11 +761,11 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
             float time{0.f};
 
             hip_check_error(
-                hipMemcpyWithStream(dev_gemm_kargs,
-                                    arg.gemm_kernel_args_.data(),
-                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(dev_gemm_kargs,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));
 
             auto preprocess = [&]() {
                 hip_check_error(hipMemsetAsync(
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
index f673713f3..76643a690 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -940,10 +940,10 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                              const void* p_host_kernel_args) const
     {
         arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
-                                  p_host_kernel_args,
-                                  GetDeviceKernelArgSize(&arg),
-                                  hipMemcpyHostToDevice));
+        hip_check_error(hipMemcpyAsync(p_dev_kernel_args,
+                                       p_host_kernel_args,
+                                       GetDeviceKernelArgSize(&arg),
+                                       hipMemcpyHostToDevice));
     }
 
     virtual void SetDeviceKernelArgs(BaseArgument* p_arg,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 86cf1da15..424347a8b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -557,12 +557,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() *
-                                                      sizeof(GemmBiasTransKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_desc_kernel_arg_.data(),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));
 
             float ave_time = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 626ffbe97..f82504ffd 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -421,11 +421,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
             }
 
             hip_check_error(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_kernel_args_.data(),
-                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));
 
             float ave_time = 0;
 
-- 
GitLab


From 50ee4267e27b875d149e642f4cebd47be1dc3b57 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Dec 2024 07:18:35 -0800
Subject: [PATCH 096/153] Bump rocm-docs-core from 1.9.2 to 1.10.0 in
 /docs/sphinx (#1706)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.9.2 to 1.10.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.9.2...v1.10.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 995dfaf02..9969824d2 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.9.2
+rocm-docs-core==1.10.0
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index d8f7c3846..bb731db2d 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.9.2
+rocm-docs-core==1.10.0
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 08d5c02c37253bf2a6852ad25f2db209f81c0fe7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 3 Dec 2024 08:42:55 -0800
Subject: [PATCH 097/153] OCP FP8 support for gfx12. (#1710)

* (2/5) bilinear gemm pass, perf bug: skip a lds has lower performance than skip b lds

* (3/5) batched gemm pass, perf bug: skip a lds has lower performance than skip b lds

* (4/5) grouped conv pass

* (5/5) attention pass, todo: debug lds perf bug

* AIT Attention API refactor (#8)

* sanity pass

* sanity pass 2

* confirm significant performance regression.

* turn on all instances

* turn off instance format

* Fix bug & tunning & format

* DML meta, self_attn+cross_attn

* sanity pass

* remove useless flag

* update tile and problem size used in AIT attention

* bug fix in grouped conv supporting check

* deprecate inline asm wmma

* Bug fix: double lds skip

* clang-format

* Fix errors in
1. example, fmha
2. gridwise pipeline
3. deviceop, fmha, change some containers from vector to array

* part2 of previous commit

* clang format

* API fix of gridwisegemmpipeline

* separate array base and vector base attention tensor transformation

* fix gemm

* clang format

* add gemm fp16 instances

* Temp save

* fpAintB kernel compile pass

* Sanity pass.

* Temp save

* debug code enabled

* Fp16AInt8B_GEMM sanity

* MQA implementation

* GQA-4 example

* tempsave

* Compile pass

* New implementation of fp16Aint8B Gemm, Acheieve similar math throughput with native fp16 Gemm

* Bump rocm-docs-core from 0.24.0 to 0.29.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.0 to 0.29.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.0...v0.29.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* initial enablement of gfx950

* fix clang format

* disable examples 31 and 41 int8 on gfx950

* initial navi4x enablement

* remove extra endif

* enabled dl_gemm

* update s_barrier and s_waitcnt for gfx12

* fix the gfx12 assembly syntax

* fixed block_sync_lds

* add support for more dl kernels on navi4

* add wmma

* format

* Todo: fix gemm_bilinear_wmma instances compilation bug

* Solve a bug when K1=16

* remove unnecessary changes

* Remove tensor layout limitation to LDS usage in tesnor contraction

* fixed block_sync_lds

* merge navi3_ref

* update self-attention and cross-attention

* fix a typo of name

* fixed layout

* debugging

* Add arch limiter for fp8 gemm

* fixed wmma

* enable fp8 gemm_xdl for all gfx9 targets

* temporarily disable gemm_xdl_fp16_fp8 on MI100/200

* fix the cmake logic for gemm_xdl_fp16_fp8

* fixed c_output

* re-enable the gemm_xdl_fp16_fp8 on MI100/200

* fixed gfx12

* fixed

* fixed

* seperate gfx12 blockwise_gemm

* fixed

* enable fwd conv on navi4x

* enable gridwise

* enabled gemm

* fixed merge

* remove empty example fold

* fixed conflicts

* some small changes

* Update cmake-ck-dev.sh

* Update cmake-ck-dev.sh

* enabled other types

* fixed register loads

* test fa

* enable gfx12

* clean up

* enable some instances on gfx12

* add gfx1201 macro in amd_wmma header

* fix clang format

* enable batched_gemm_softmax_gemm_perm_wmma for gfx12

* disable instances with blocksize=256 in attention examples

* debuggging

* debug

* fixed lds_enabled

* debugging

* Fix and add limit to skiplds feature

* Enable skipLds feature and fix compilation bugs

* add ck_tile definitions for gfx12

* fix clang format and test/wmma_op

* updage instances cmake for gfx12

* disable the test_wmma_op on gfx12

* fix the builds for gfx950

* add gfx12 and gfx950 to default target list

* clean-up cmake file

* Initial introduction of OFP8 data types.

* Renamed FP8 and BF8 tests into FP8_FNUZ and BF8_FNUZ.

* Implementation of ConvertFP32Nearest in test_fp8_ocp.

* Remove dependence on possibly undeclared alias.

* Implement FP8OCP test for stochastic rounding mode.

* Implement FP8OCP tests for half_t type conversions.

* enable bf16 atomic add on gfx950

* Implement ConvertFP32Nearest test.

* Implement ConvertFP32Stochastic test.

* Implement ConvertFP16Nearest and ConvertFP16Stochastic tests.

* Refactoring. Move FP8 definitions into a separate header file.

* Enable easy switching between architectures.

* Fix compilation error for gfx942 architecture.

* only builf gfx950 branch for gfx950 target by default

* Enable OCP build of example_gemm_xdl_fp8.

* Fix formatting.

* fix the build logic for gfx950

* Improve GEMM example verbosity.

* Add constexpr where applicable.

* fix the logic of enabling XDL and WMMA instances

* Improve GEMM example verbosity.

* Enable build of example_gemm_xdl_fp8_bf8 test.

* Fix tests for gfx1101 architecture.

* Build DPP examples only on gfx103 and gfx11 architectures.

* Optionaly run either CPU or GPU verifications with GEMM examples.

* Extend GeneratorTensor_Sequential to produce values of prescribed data types.

* Add missing constructor.

* Improve infrastructure for OFP8 data type support.

* BUGFIX. Should not use FP8 as Compute/Accum data type.

* Add custom target for grouped_convnd_bwd_weight tests.

* Can build `tests` target on gfx950.

* Bugfixes on gfx1101 architecture.

* Fix dependencies.

* Provide single point of truth for FP8 INF and NAN checks

* Prevent instantiation of operators that are not supported by FP8 data types

* Add FP8 type selection into client_axample CMakeLists.txt

* Prevent sccache server from shutting down during build

* Fix test success reporting logic

* Change default verification method to CPU.

GPU verification takes too much time to complete on the emulator.

* Make sure all tests and examples are built for gfx950

* Facilitate testing of FP8 data types on the emulator

* Introduce two new tensor generators

* Enable instances built for gfx94 to be built on gfx950

* Verify 35_splitk_gemm on floating point numbers.

splitk gemm appears to be losing precision VS reference implementation when FP numbers are involved.

* Verify 04_gemm_add_add_fastgelu on floating point numbers

* Verify 20_grouped_conv_bwd_weight on floating point numbers

* Verify 38_grouped_conv_bwd_data_multiple_d on floating point numbers

* Verify more tests on floating point data

* Fix data types and improve testing verbocity.

* Upgrade to NPI 573 build docker.

* Skip on gemm_universal tests.

The tests take too long to complete on the emulator.
Need to see if it is possible to reduce the scope of the testing to just FP8 data types.

* Fix gfx1101 build

* Document test availability

* Re-enable fp8 gemms for gfx94/95

* Cherry-pick GEMM Universal tests for FP8 data types

* Cleanup

* CK_USE_GFX94 has already been set on this branch

* Address formatting issues and leftovers

* Make fail/pass logic consistent within 01_gemm folder

Removed multiple negations in fail/pass logic to propagate `true` as the success indicator.

* Fix GPU verification reporting logic.

* Update year in copyright notice.

* Cleanup

* Use `enum class` instead of `enum`

* Remove set_property for FP8 tests

* Narrowing the scope of PR to OCP FP8 enablement only

* Add tests for OCP FP8 vector_type storage

* Enable gemm kernel on all gfx9 architectures (#227)

* clean-up

* Implement `non_native_vector_base` with `ext_vector_type` array. (#232)

* Enable support of 1, 2, 4, and 8-byte custom types in CK.

* Fix pool tests for OCP FP8 data type

* fix jenkins file

* restore cron trigger

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: aska-0096 <haocwang@amd.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jing Zhang <jizhan@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: Jun Liu <Liu.Jun@amd.com>
Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com>
Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com>
---
 CMakeLists.txt                                |  11 +-
 client_example/CMakeLists.txt                 |   8 +
 example/01_gemm/common.hpp                    |   2 +-
 example/01_gemm/run_gemm_example.inc          |   4 +-
 ...rouped_gemm_multiple_d_splitk_xdl_fp16.cpp |   8 +-
 .../grouped_gemm_multiple_d_xdl_fp16.cpp      |   8 +-
 .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp   |   6 +-
 .../grouped_gemm_xdl_fixed_nk_fp16.cpp        |   4 +-
 .../grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp    |   4 +-
 .../run_grouped_gemm_example.inc              |   7 +-
 ...xdl_layernorm_naive_single_kernel_fp16.cpp |   6 +-
 .../run_batched_gemm_gemm_example.inc         |   4 +-
 .../run_batched_gemm_scale_softmax_gemm.inc   |   4 +-
 ...atched_gemm_scale_softmax_gemm_permute.inc |   4 +-
 ...d_gemm_scale_softmax_gemm_permute_wmma.inc |   4 +-
 .../run_cross_attention_wmma.inc              |   4 +-
 ...rouped_gemm_scale_softmax_gemm_permute.inc |   4 +-
 ...n_grouped_query_attention_forward_wmma.inc |   4 +-
 ...run_multi_query_attention_forward_wmma.inc |   4 +-
 .../run_self_attention_wmma.inc               |   4 +-
 .../run_splitK_gemm_example.inc               |   7 +-
 ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp |   2 +-
 .../common.hpp                                |   4 +-
 .../gemm_bias_softmax_gemm_permute_xdl.cpp    |   4 +-
 ...mm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp |   8 +-
 ..._gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp |   6 +-
 ...emm_multiply_multiply_xdl_fp8_ab_scale.cpp |   3 -
 example/CMakeLists.txt                        |   7 +
 include/ck/library/utility/host_tensor.hpp    |   2 +-
 .../library/utility/host_tensor_generator.hpp |  31 +-
 ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp |   3 +-
 ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp |   3 +-
 include/ck/utility/amd_buffer_addressing.hpp  |  14 +-
 include/ck/utility/amd_ck_fp8.hpp             | 988 ++++++++++++++++++
 include/ck/utility/amd_xdlops.hpp             |   2 +-
 include/ck/utility/data_type.hpp              | 443 ++++++--
 include/ck/utility/math_v2.hpp                |   4 +-
 include/ck/utility/random_gen.hpp             |  13 +-
 include/ck/utility/type_convert.hpp           | 204 ++--
 .../cpu/reference_gemm.hpp                    |  10 +-
 .../gpu/CMakeLists.txt                        |   4 +-
 ...evice_max_pool3d_fwd_ndhwc_f8_instance.cpp |   4 +-
 ...ed_gemm_bias_softmax_gemm_permute_impl.hpp |   4 +-
 .../profile_batched_gemm_gemm_impl.hpp        |   4 +-
 ...profile_batched_gemm_softmax_gemm_impl.hpp |   4 +-
 ...batched_gemm_softmax_gemm_permute_impl.hpp |   4 +-
 .../include/profiler/profile_gemm_impl.hpp    |   6 +-
 test/data_type/CMakeLists.txt                 |  37 +-
 .../{test_bf8.cpp => test_bf8_fnuz.cpp}       | 135 +--
 test/data_type/test_bf8_ocp.cpp               | 268 +++++
 test/data_type/test_custom_type.cpp           | 158 +++
 .../{test_fp8.cpp => test_fp8_fnuz.cpp}       | 149 +--
 test/data_type/test_fp8_ocp.cpp               | 250 +++++
 test/pool/test_avg_pool2d_fwd.cpp             |   2 +-
 test/pool/test_max_pool2d_fwd.cpp             |   2 +-
 55 files changed, 2509 insertions(+), 384 deletions(-)
 create mode 100644 include/ck/utility/amd_ck_fp8.hpp
 rename test/data_type/{test_bf8.cpp => test_bf8_fnuz.cpp} (52%)
 create mode 100644 test/data_type/test_bf8_ocp.cpp
 rename test/data_type/{test_fp8.cpp => test_fp8_fnuz.cpp} (52%)
 create mode 100644 test/data_type/test_fp8_ocp.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b28a6d912..2c8698756 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     add_definitions(-DCK_USE_XDL)
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
-    message("Enabling FP8 gemms in ckProfiler")
+    message("Enabling FP8 gemms on native architectures")
     add_definitions(-DCK_USE_GFX94)
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     message("Enabling WMMA instances")
     add_definitions(-DCK_USE_WMMA)
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    add_definitions(-DCK_USE_OCP_FP8)
+    set(CK_USE_OCP_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    add_definitions(-DCK_USE_FNUZ_FP8)
+    set(CK_USE_FNUZ_FP8 "ON")
+endif()
+
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
     add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt
index c393972b4..ce5834d1e 100644
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -56,6 +56,14 @@ if (GPU_TARGETS)
         add_definitions(-DCK_USE_WMMA)
         set(CK_USE_WMMA "ON")
     endif()
+    if (GPU_TARGETS MATCHES "gfx12")
+        add_definitions(-DCK_USE_OCP_FP8)
+        set(CK_USE_OCP_FP8 "ON")
+    endif()
+    if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
+        add_definitions(-DCK_USE_FNUZ_FP8)
+        set(CK_USE_FNUZ_FP8 "ON")
+    endif()
 else()
     add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
     set(CK_USE_XDL "ON")
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 67bf92bbb..a3a62d4cf 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -76,7 +76,7 @@ struct ProblemSizeSplitK final
 struct ExecutionConfig final
 {
     // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
-    int do_verification = 3;
+    int do_verification = 1;
     int init_method     = 2;
     bool time_kernel    = false;
 };
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index bafec3f35..3ee6e2685 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     switch(config.init_method)
     {
     case 0:
-        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
-        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.f)}(b_k_n);
         break;
     case 1:
         ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
index 8bbf8e629..117a18e3b 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
@@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             for(int j = 0; j < NumDMatrices; ++j)
             {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
             }
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
             for(int j = 0; j < NumDMatrices; ++j)
             {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<DDataType, 0>{});
             }
         }
     }
diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
index e7b2ee417..db162fe44 100644
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -190,15 +190,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             for(int j = 0; j < NumDs; ++j)
             {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
             }
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
             for(int j = 0; j < NumDs; ++j)
             {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<DDataType, 0>{});
             }
         }
     }
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
index 3b3ef508c..5bdc99319 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -167,11 +167,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
         }
 
-        d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<D0DataType, 1>{});
     }
 
     using GroupedGemmKernelArgument = ck::tensor_operation::device::GroupedGemmKernelArgument<1>;
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
index c1043f419..6806bd188 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
         }
     }
 
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
index c81874b06..8418c10f5 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -158,8 +158,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
         }
     }
 
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 7cb0588b8..64125cd1d 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
 
 struct ProblemSize final
@@ -124,8 +127,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
             break;
         default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
         }
     }
 
diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
index 90d80f9f0..277fea027 100644
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }
 
     c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
index f32914672..d54550868 100644
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -150,7 +150,7 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
         break;
     default:
         a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
index 27602e231..1514fc48b 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -157,7 +157,7 @@ int run(int argc, char* argv[])
         break;
     default:
         a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
index fa76faea8..2b02069e6 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -118,7 +118,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
index 2e77479bc..e0ccb6dad 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -153,7 +153,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
index 9ff4c56e0..0ad031cc7 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -178,7 +178,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
index ea1e2734a..cdfd86dff 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -152,7 +152,7 @@ int run(int argc, char* argv[])
             break;
         default:
             a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
             b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
         }
 
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
index 609d08529..7ac29f33c 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -156,7 +156,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
index b05915c07..fb9b1b0bd 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -156,7 +156,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
index 3fdaaebb0..2cb69380e 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 int run(int argc, char* argv[])
 {
@@ -173,7 +173,7 @@ int run(int argc, char* argv[])
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/example/35_splitK_gemm/run_splitK_gemm_example.inc
index e3690984a..cb1d3410c 100644
--- a/example/35_splitK_gemm/run_splitK_gemm_example.inc
+++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
 
 struct ProblemSize final
@@ -66,8 +69,8 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index ff1282f3c..f27dc6054 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -377,7 +377,7 @@ int main(int argc, char* argv[])
         break;
     default:
         a0_g_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         d00_g_m_n.GenerateTensorValue(GeneratorTensor_1<D00DataType>{1});
         d01_g_m_n.GenerateTensorValue(GeneratorTensor_1<D01DataType>{1});
         b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
index 8a0474156..6af8ac648 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -41,7 +41,7 @@ struct ExecutionConfig final
 {
     bool do_verification = true;
     int init_method      = 1;
-    bool time_kernel     = true;
+    bool time_kernel     = false;
 };
 
 #define DefaultConvParams                                                                \
diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
index a90a6340a..392cb155c 100644
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <vector>
@@ -248,7 +248,7 @@ int main(int argc, char* argv[])
         d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
         break;
     default:
-        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 2>{});
         b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
         d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
index 742fd5547..055d25304 100644
--- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
+++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -194,9 +194,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b1_tensors[i].GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
             break;
         default:
-            a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
-            b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A0DataType, 0>{});
+            b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
+            b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B1DataType, 1>{});
         }
 
         d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
index 809c1a956..1ba8133ea 100644
--- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
@@ -184,9 +184,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             b_tensors[i].GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
             break;
         default:
-            a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A0DataType, 0>{});
+            a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A1DataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         }
 
         d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
index 256875464..9b7849a65 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
@@ -205,7 +205,6 @@ int main(int argc, char* argv[])
     a1_device_buf.ToDevice(a1_m_k.mData.data());
     b0_device_buf.ToDevice(b0_k_n.mData.data());
     b1_device_buf.ToDevice(b1_k_n.mData.data());
-    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
 
     auto a_element_op   = AElementOp{};
     auto b_element_op   = BElementOp{};
@@ -253,8 +252,6 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-
     if(do_verification)
     {
         Tensor<AccDataType> c_m_n({M, N});
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index ea739c707..72759916a 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -54,6 +54,13 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
     endforeach()
+    #Do not build any DPP examples if DL_KERNELS not set
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp")
+            message("removing dpp example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
     #Do not build any XDL examples if gfx9 targets are not on the list
     foreach(source IN LISTS FILE_NAME)
         if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl")
diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
index a58acaf11..18e1db462 100644
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -326,7 +326,7 @@ struct Tensor
 
     std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
 
-    void SetZero() { ck::ranges::fill<T>(mData, 0); }
+    void SetZero() { ck::ranges::fill<T>(mData, T{0}); }
 
     template <typename F>
     void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp
index e87811b76..ab9f01b53 100644
--- a/include/ck/library/utility/host_tensor_generator.hpp
+++ b/include/ck/library/utility/host_tensor_generator.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -37,7 +37,7 @@ struct GeneratorTensor_1<ck::half_t>
     float value = 1.0;
 
     template <typename... Is>
-    ck::bhalf_t operator()(Is...)
+    ck::half_t operator()(Is...)
     {
         return ck::type_convert<ck::half_t>(value);
     }
@@ -62,7 +62,7 @@ struct GeneratorTensor_1<ck::f8_t>
     float value = 1.0;
 
     template <typename... Is>
-    ck::bhalf_t operator()(Is...)
+    ck::f8_t operator()(Is...)
     {
         return ck::type_convert<ck::f8_t>(value);
     }
@@ -256,14 +256,33 @@ struct GeneratorTensor_Checkboard
     }
 };
 
-template <ck::index_t Dim>
+/**
+ * @brief Is used to generate sequential values based on the specified dimension.
+ *
+ * @tparam T The type of the tensor values.
+ * @tparam Dim The specific dimension used for generation.
+ *
+ * GeneratorTensor_Sequential<1>{} will generate the following values for a 3x3 tensor:
+ *
+ * 0 1 2
+ * 0 1 2
+ * 0 1 2
+ *
+ * Essentially, the values generated are logical coordinates of the generated element that
+ * correspond to dimension Dim. E.g. for 2-dimensional tensor and Dim=1, the values are the column
+ * indices.
+ *
+ */
+template <typename T, ck::index_t Dim>
 struct GeneratorTensor_Sequential
 {
     template <typename... Ts>
-    float operator()(Ts... Xs) const
+    T operator()(Ts... Xs) const
     {
         std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
-        return dims[Dim];
+
+        float tmp = dims[Dim];
+        return ck::type_convert<T>(tmp);
     }
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
index c1f58ccda..a7f129b2b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -111,8 +111,7 @@ __global__ void
             [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
             [[maybe_unused]] const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
index da6b1b304..813acfa65 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -38,8 +38,7 @@ __global__ void
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index d4ee5c886..5367c3d72 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
             (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, fp8_storage_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
         "wrong! not implemented");
 
@@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 
 #else
 
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
     return src_thread_element_valid ? tmp : vector_t(0);
 #endif
 }
@@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
 
     constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
 
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
 
     return src_thread_element_valid ? tmp : vector_t(customized_value);
 }
diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp
new file mode 100644
index 000000000..7b21ad646
--- /dev/null
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -0,0 +1,988 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/random_gen.hpp"
+#include "ck/utility/type.hpp"
+
+#ifdef CK_USE_FNUZ_FP8
+#define CK_USE_FNUZ_FP8 1
+#else
+#define CK_USE_FNUZ_FP8 0
+#endif
+
+#ifdef CK_USE_OCP_FP8
+#define CK_USE_OCP_FP8 1
+#else
+#define CK_USE_OCP_FP8 0
+#endif
+
+namespace ck {
+
+using f8_fnuz_t  = _BitInt(8);
+using bf8_fnuz_t = unsigned _BitInt(8);
+
+#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \
+    defined(__gfx1201__)) &&                                                                     \
+    __HIP_DEVICE_COMPILE__
+#define CK_FP8_CVT_FAST_PATH 1
+#else
+#define CK_FP8_CVT_FAST_PATH 0
+#endif
+
+#if(defined(__gfx1200__) || defined(__gfx1201__)) && __HIP_DEVICE_COMPILE__
+#define CK_OCP_FP8_CVT_FAST_PATH 1
+#else
+#define CK_OCP_FP8_CVT_FAST_PATH 0
+#endif
+
+typedef unsigned char fp8_storage_t;
+
+/**
+ * \brief Describes FP8 interpretation
+ */
+enum class ck_fp8_interpretation_t
+{
+    CK_E4M3_OCP  = 0, // OCP E4M3
+    CK_E5M2_OCP  = 1, // OCP E5M2
+    CK_E4M3_FNUZ = 2, // FP8
+    CK_E5M2_FNUZ = 3, // BF8
+};
+
+/**
+ * \brief Describes saturation behavior
+ */
+enum class ck_saturation_t
+{
+    CK_NOSAT     = 0, // No saturation - replace with NaN or Inf
+    CK_SATFINITE = 1, // Saturate to finite
+};
+
+namespace fp8_impl {
+
+typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2)));
+typedef float float2_t __attribute__((ext_vector_type(2)));
+
+__host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a)
+{
+    return static_cast<unsigned char>(a) == 0x80;
+}
+__host__ __device__ static inline constexpr bool fnuz_bf8_is_nan(bf8_fnuz_t a)
+{
+    return static_cast<unsigned char>(a) == 0x80;
+}
+
+__host__ __device__ static inline constexpr bool ocp_f8_is_nan(fp8_storage_t a)
+{
+    return (a & 0x7f) == 0x7f;
+}
+__host__ __device__ static inline constexpr bool ocp_bf8_is_nan(fp8_storage_t a)
+{
+    return (a & 0x7f) > 0x7c;
+}
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220
+// This has been modified to handle double types as well
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false>
+__host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double, "only half, float and double are supported");
+
+    constexpr int weo = is_half ? 5 : (is_float ? 8 : 11);
+    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52);
+
+    T fInf, fNegInf, fNaN, fNeg0, fmax, fmin;
+    if constexpr(is_half)
+    {
+        const unsigned short int ihInf    = 0x7C00;
+        const unsigned short int ihNegInf = 0xFC00;
+        const unsigned short int ihNaN    = 0x7C01;
+        const unsigned short int ihNeg0   = 0x8000;
+        /* Max number in e5m2 57344*/
+        const unsigned short int ifmax = 0x7B00;
+        const unsigned short int ifmin = 0xFB00;
+
+        fInf    = bit_cast<_Float16>(ihInf);
+        fNegInf = bit_cast<_Float16>(ihNegInf);
+        fNaN    = bit_cast<_Float16>(ihNaN);
+        fNeg0   = bit_cast<_Float16>(ihNeg0);
+        fmax    = bit_cast<_Float16>(ifmax);
+        fmin    = bit_cast<_Float16>(ifmin);
+    }
+    else if constexpr(is_float)
+    {
+        const unsigned int ifInf    = 0x7F800000;
+        const unsigned int ifNegInf = 0xFF800000;
+        const unsigned int ifNaN    = 0x7F800001;
+        const unsigned int ifNeg0   = 0x80000000;
+        /* Max number in e5m2 57344*/
+        const unsigned int ifmax = 0x47600000;
+        const unsigned int ifmin = 0xC7600000;
+
+        fInf    = bit_cast<float>(ifInf);
+        fNegInf = bit_cast<float>(ifNegInf);
+        fNaN    = bit_cast<float>(ifNaN);
+        fNeg0   = bit_cast<float>(ifNeg0);
+        fmax    = bit_cast<float>(ifmax);
+        fmin    = bit_cast<float>(ifmin);
+    }
+    else if constexpr(is_double)
+    {
+        const unsigned long long ifInf    = 0x7FF0000000000000ull;
+        const unsigned long long ifNegInf = 0xFFF0000000000000ull;
+        const unsigned long long ifNaN    = 0x7FF0000000000001ull;
+        const unsigned long long ifNeg0   = 0x8000000000000000ull;
+        /* Max number in e5m2 57344*/
+        const unsigned long long ifmax = 0x40EC000000000000ull;
+        const unsigned long long ifmin = 0xC0EC000000000000ull;
+
+        fInf    = bit_cast<double>(ifInf);
+        fNegInf = bit_cast<double>(ifNegInf);
+        fNaN    = bit_cast<double>(ifNaN);
+        fNeg0   = bit_cast<double>(ifNeg0);
+        fmax    = bit_cast<double>(ifmax);
+        fmin    = bit_cast<double>(ifmin);
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    unsigned long long sign     = x >> 7;
+    unsigned long long mantissa = x & ((1 << wm) - 1);
+    int exponent                = (x & 0x7F) >> wm;
+    if constexpr(is_fnuz)
+    {
+        if(x == 0x80)
+        {
+            return fNaN;
+        }
+    }
+    else
+    {
+        if(x == 0x80)
+        {
+            return fNeg0;
+        }
+        if constexpr(we == 4)
+        { // e4m3
+            if((x & 0x7F) == 0x7F)
+            {
+                return fNaN;
+            }
+        }
+        else if((x & 0x7C) == 0x7C)
+        { // e5m2
+            if((x & 0x3) == 0)
+            {
+                if constexpr(clip)
+                {
+                    return sign ? fmin : fmax;
+                }
+                return sign ? fNegInf : fInf;
+            }
+            return fNaN;
+        }
+    }
+
+    typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type retval;
+
+    if constexpr(we == 5 && is_half && !is_fnuz)
+    {
+        retval = x << 8;
+        return bit_cast<T>(retval);
+    }
+
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (is_fnuz ? 1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        int sh = 1 + __clz(mantissa) - (32 - wm);
+#else
+        int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
+#endif
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1ull << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    if constexpr(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa;
+    else if constexpr(sizeof(T) == 4)
+        retval = (sign << 31) | (exponent << 23) | mantissa;
+    else
+        retval = (sign << 63) | (static_cast<unsigned long long>(exponent) << 52) | mantissa;
+
+    return bit_cast<T>(retval);
+}
+
+#if CK_FP8_CVT_FAST_PATH
+template <ck_fp8_interpretation_t interpret>
+static __device__ float cast_to_f32_from_f8(fp8_storage_t v)
+{
+    union
+    {
+        unsigned int i32val;
+        unsigned char i8val[4];
+    } val;
+    val.i8val[0] = v;
+
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                 (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+    {
+        return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
+    }
+}
+
+template <ck_fp8_interpretation_t interpret>
+static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v)
+{
+    const auto i16val = bit_cast<uint16_t>(v);
+
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                 (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+    {
+        return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, false);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_pk_f32_bf8(i16val, false);
+    }
+}
+
+#endif
+
+} // namespace fp8_impl
+
+struct f8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data;
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E4M3_OCP;
+
+    static constexpr unsigned int we = 4; // exponent width
+    static constexpr unsigned int wm = 3; // mantissa width
+
+    __host__ __device__ constexpr bool operator==(const f8_ocp_t& other) const
+    {
+        return (data == other.data) && (fp8_impl::ocp_f8_is_nan(data) == false); // NaN != NaN
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
+#else
+        return fp8_impl::cast_from_f8<float, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator _Float16
+#endif
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
+#else
+        return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator float
+#endif
+    }
+};
+
+struct bf8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data;
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E5M2_OCP;
+
+    static constexpr unsigned int we = 5; // exponent width
+    static constexpr unsigned int wm = 2; // mantissa width
+
+    __host__ __device__ constexpr bool operator==(const bf8_ocp_t& other) const
+    {
+        return (data == other.data) && (fp8_impl::ocp_bf8_is_nan(data) == false); // NaN != NaN
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if defined(__gfx1200__) || defined(__gfx1201__)
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
+#else
+        return fp8_impl::cast_from_f8<float, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator _Float16
+#endif
+    }
+
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if defined(__gfx1200__) || defined(__gfx1201__)
+        return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
+#else
+        return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator float
+#endif
+    }
+};
+
+template <typename T>
+__host__ __device__ static inline constexpr bool fp8_is_nan(T);
+
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_ocp_t a)
+{
+    return fp8_impl::ocp_f8_is_nan(a.data);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_ocp_t a)
+{
+    return fp8_impl::ocp_bf8_is_nan(a.data);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_fnuz_t a)
+{
+    return fp8_impl::fnuz_f8_is_nan(a);
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a)
+{
+    return fp8_impl::fnuz_bf8_is_nan(a);
+}
+
+template <typename T,
+          std::enable_if_t<std::is_same_v<T, bf8_ocp_t> || std::is_same_v<T, f8_ocp_t> ||
+                               std::is_same_v<T, bf8_fnuz_t> || std::is_same_v<T, f8_fnuz_t>,
+                           bool> = true>
+__host__ __device__ static inline constexpr bool fp8_is_inf(T)
+{
+    return false;
+}
+template <>
+__host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a)
+{
+    return (a.data & 0x7f) == 0x7c;
+}
+
+namespace fp8_impl {
+
+// Assertions to check for supported conversion types
+#define __assert_ocp_support(interp)                                               \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP &&                       \
+           interp != ck_fp8_interpretation_t::CK_E5M2_OCP)                         \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+#define __assert_fnuz_support(interp)                                              \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ &&                      \
+           interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ)                        \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+
+__host__ __device__ static inline void
+__is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp)
+{
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+#if CK_USE_OCP_FP8
+    __assert_ocp_support(interp);
+#endif
+#if CK_USE_FNUZ_FP8
+    __assert_fnuz_support(interp);
+#endif
+#endif
+}
+
+#if CK_FP8_CVT_FAST_PATH
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79
+template <ck_fp8_interpretation_t interpret, bool saturate, bool stochastic_rounding = false>
+static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng = 0)
+{
+    fp8_storage_t i8data;
+    union
+    {
+        float fval;
+        unsigned int i32val;
+        unsigned char i8val[4]; // NOTE: not endian independent
+    } val;
+
+    unsigned int ival = 0;
+    val.fval          = v;
+
+    if constexpr(saturate)
+    {
+        if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
+            }
+        }
+        else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+        { // OCP type
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 448.0, -448.0);
+            }
+        }
+        else
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000)
+            { /// propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0);
+            }
+        }
+    }
+
+    if constexpr(stochastic_rounding)
+    {
+        ival       = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                       (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+                         ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
+                         : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+        val.i32val = ival;
+        i8data     = val.i8val[0]; // little endian
+    }
+    else
+    { // RNE CVT
+        ival       = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                       (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+                         ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
+                         : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval,
+                                                     val.fval,
+                                                     ival,
+                                                     false); // false -> WORD0
+        val.i32val = ival;
+        i8data     = val.i8val[0];
+    }
+    return i8data;
+}
+#endif // CK_FP8_CVT_FAST_PATH
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39
+// This has been modified to add double types conversion as well
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false, bool stoch = false>
+__host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rng = 0)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double,
+                  "Only half, float and double can be cast to f8");
+
+    constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10);
+
+    using T_bitwise = typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type;
+    T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
+
+    unsigned long long x{x_bitwise};
+
+    unsigned long long head, mantissa;
+    int exponent, bias;
+    unsigned int sign;
+    unsigned long long fInf, mask;
+
+    if constexpr(sizeof(T) == 8)
+    {
+        head     = x & 0xFFF0000000000000ull;
+        mantissa = x & 0xFFFFFFFFFFFFFull;
+        exponent = (head >> 52) & 0x7FF;
+        sign     = head >> 63;
+        bias     = 1023;
+        fInf     = 0x7FF0000000000000ull;
+        mask     = 0x7FFFFFFFFFFFFFFFull;
+    }
+    else if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+        fInf     = 0x7F800000;
+        mask     = 0x7FFFFFFF;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+        fInf     = 0x7C00;
+        mask     = 0x7FFF;
+    }
+    unsigned int signed_inf = 0;
+    unsigned int nan        = 0;
+    if constexpr(is_fnuz)
+    {
+        signed_inf = clip ? ((sign << 7) + 0x7f) : 0x80;
+        nan        = 0x80;
+    }
+    else
+    {
+        if constexpr(we == 4)
+        { // e4m3
+            signed_inf = (sign << 7) + (clip ? 0x7e : 0x7f);
+        }
+        else
+        { // e5m2
+            signed_inf = (sign << 7) + (clip ? 0x7b : 0x7c);
+        }
+        nan = (sign << 7) + 0x7f;
+    }
+    // Max values
+    unsigned long long ifmax = 0;
+    if constexpr(sizeof(T) == 8)
+    {
+        if constexpr(we == 5)
+        { // 57344
+            ifmax = 0x40EC000000000000ull;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            { // 240
+                ifmax = 0x406E000000000000ull;
+            }
+            else
+            { // 448
+                ifmax = 0x407C000000000000ull;
+            }
+        }
+    }
+    else if(sizeof(T) == 4)
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x47600000;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x43700000;
+            }
+            else
+            {
+                ifmax = 0x43E00000;
+            }
+        }
+    }
+    else
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x7B00;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x5B80;
+            }
+            else
+            {
+                ifmax = 0x5F00;
+            }
+        }
+    }
+    // Deal with inf and NaNs
+    if((x & fInf) == fInf)
+    {
+        if constexpr(is_fnuz)
+            return signed_inf;
+
+        return mantissa != 0 ? nan : signed_inf;
+    }
+
+    if((x & mask) > ifmax)
+    {
+        return signed_inf;
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    // First need to check if it is normal or denorm as there is a difference of
+    // implicit 1 Then need to adjust the exponent to align with the F8 exponent,
+    // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
+    // to mantissa and truncate. And for RNE, no need to add rng. Then probably
+    // need to check whether there is carry and adjust exponent and mantissa again
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
+    // bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (is_fnuz ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
+    mostly concern fp16 here. In this case, f8 is usually in denormal. But there
+    could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
+    exponent bias 16. It means that there are some numbers in fp16 denormal but they
+    are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+    where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
+    (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal
+      range. For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+      actual exponent is -7, it is actually larger due to the implicit 1,
+      Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+      So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {                      // both fp32/fp16 and f8 are in normal range
+            exponent_diff = 0; // exponent_diff=0 does not mean there is no difference
+                               // for this case, act_exponent could be larger. Just
+                               // that it does not need shift mantissa
+        }
+        mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa
+    }
+
+    bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (1ull << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be
+  done before we shift right as shift right could rip off some residual part and
+  make something not midpoint look like midpoint. For example, the fp16 number
+  0x1002 (0 00100 0000000010), it is larger than midpoint, but after shift right
+  by 4 bits, it would look like midpoint.
+  */
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1ull << mfmt);
+    // if there is no implicit 1, it  means the f8 is denormal and need to adjust
+    // to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa +=
+        (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask;
+
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1ull << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1ull << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1;
+            f8_exponent++;
+        }
+    }
+
+    mantissa >>= (mfmt - wm);
+
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - 1;
+    if(f8_exponent > max_exp)
+    {
+        if constexpr(clip)
+        {
+            mantissa    = (1 << wm) - 1;
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    if(f8_exponent == 0 && mantissa == 0)
+        return is_fnuz ? 0 : (sign << 7);
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+
+/**
+ * \brief convert float to @p fp8_storage_t
+ *
+ * \tparam interp interpretation of fp8
+ * \tparam sat saturation of fp8
+ * \param f float number
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+    __is_interpret_supported(interp);
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+        f, rng);
+#else
+#if CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#else
+__host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#endif
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+
+    if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_FNUZ)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else
+    {
+        __hip_assert(false && "FP8 type is not supported by current target device");
+        return 0;
+    }
+#endif // CK_FP8_CVT_FAST_PATH
+}
+
+/**
+ * \brief convert _Float16 to @p fp8_storage_t
+ *
+ * \tparam sat saturation of fp8
+ * \tparam interp interpretation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x _Float16 value
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#else
+__host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#endif
+{
+    return cvt_float_to_fp8<interp, sat, stochastic_rounding>(static_cast<float>(x));
+}
+
+} // namespace fp8_impl
+
+// Declare a template function for fp8 conversion using RNE
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_rne(X x);
+
+// convert fp32 to fp8 with rounding to nearest even
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+// convert fp32 to bf8 with rounding to nearest even
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
+}
+
+// convert _Float16 to fp8 with rounding to nearest even
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+// Declare a template function for fp8 conversion using RNE
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_sr(X x);
+
+// convert fp32 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
+            x)};
+}
+
+// convert fp32 to bf8 with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
+                                                bf8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to fp8 with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
+                                                f8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to bf8 with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                 bf8_ocp_t::default_saturation,
+                                                 true>(x)};
+}
+
+#if CK_USE_OCP_FP8
+using f8_t  = f8_ocp_t;
+using bf8_t = bf8_ocp_t;
+#define CK_FP8_TYPE_FNUZ 0
+#define CK_FP8_TYPE_OCP 1
+#else
+using f8_t = f8_fnuz_t;
+using bf8_t = bf8_fnuz_t;
+#define CK_FP8_TYPE_FNUZ 1
+#define CK_FP8_TYPE_OCP 0
+#endif
+
+} // namespace ck
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index a955279bc..5a7030cca 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 39f532e0e..a7dc071bc 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/statically_indexed_array.hpp"
 
 namespace ck {
@@ -10,8 +11,6 @@ namespace ck {
 using bhalf_t = ushort;
 using half_t  = _Float16;
 using int4_t  = _BitInt(4);
-using f8_t    = _BitInt(8);
-using bf8_t   = unsigned _BitInt(8);
 
 inline constexpr auto next_pow2(uint32_t x)
 {
@@ -19,14 +18,15 @@ inline constexpr auto next_pow2(uint32_t x)
     return x > 1u ? (1u << (32u - __builtin_clz(x - 1u))) : x;
 }
 
-// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_t, bf8_t, bool
+// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t,
+// native types: bool
 template <typename T>
 inline constexpr bool is_native_type()
 {
     return is_same<T, double>::value || is_same<T, float>::value || is_same<T, half_t>::value ||
            is_same<T, bhalf_t>::value || is_same<T, int32_t>::value || is_same<T, int8_t>::value ||
-           is_same<T, uint8_t>::value || is_same<T, f8_t>::value || is_same<T, bf8_t>::value ||
-           is_same<T, bool>::value;
+           is_same<T, uint8_t>::value || is_same<T, f8_fnuz_t>::value ||
+           is_same<T, bf8_fnuz_t>::value || is_same<T, bool>::value;
 }
 
 // vector_type
@@ -166,16 +166,30 @@ struct scalar_type<int4_t>
 #endif
 
 template <>
-struct scalar_type<f8_t>
+struct scalar_type<f8_fnuz_t>
 {
-    using type                           = f8_t;
+    using type                           = f8_fnuz_t;
     static constexpr index_t vector_size = 1;
 };
 
 template <>
-struct scalar_type<bf8_t>
+struct scalar_type<bf8_fnuz_t>
 {
-    using type                           = bf8_t;
+    using type                           = bf8_fnuz_t;
+    static constexpr index_t vector_size = 1;
+};
+
+template <>
+struct scalar_type<f8_ocp_t>
+{
+    using type                           = f8_ocp_t::data_type;
+    static constexpr index_t vector_size = 1;
+};
+
+template <>
+struct scalar_type<bf8_ocp_t>
+{
+    using type                           = bf8_ocp_t::data_type;
     static constexpr index_t vector_size = 1;
 };
 
@@ -1010,60 +1024,203 @@ struct vector_type<T, 256, typename std::enable_if_t<is_native_type<T>()>>
     }
 };
 
+template <typename T, index_t N, typename Enable = void>
+struct non_native_vector_base;
+
+template <typename T>
+struct nnvb_data_t_selector
+{
+    using type = unsigned _BitInt(8 * sizeof(T));
+};
+
+template <>
+struct nnvb_data_t_selector<f8_ocp_t>
+{
+    using type = f8_ocp_t::data_type;
+};
+template <>
+struct nnvb_data_t_selector<bf8_ocp_t>
+{
+    using type = bf8_ocp_t::data_type;
+};
+
+template <typename T, index_t N>
+struct non_native_vector_base<
+    T,
+    N,
+    std::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
+{
+    using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T
+    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
+    using data_v = data_t __attribute__((ext_vector_type(N)));
+    using type   = non_native_vector_base<T, N>;
+
+    union alignas(next_pow2(N * sizeof(T)))
+    {
+        data_v dN; // storage vector;
+        StaticallyIndexedArray<data_t, N> dxN;
+        StaticallyIndexedArray<T, N> dTxN;
+        StaticallyIndexedArray<data_v, 1> dNx1;
+    } data_;
+
+    __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {}
+    __host__ __device__ constexpr non_native_vector_base(T f)
+        : non_native_vector_base(bit_cast<data_t>(f))
+    {
+    }
+    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
+    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
+
+    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
+    __host__ __device__ constexpr operator data_t() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dxN; // XXX this should cause an error
+        }
+    }
+    __host__ __device__ constexpr operator T() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dTxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dTxN; // XXX this should cause an error
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
 template <typename T, index_t N>
-struct non_native_vector_base
+struct scalar_type<non_native_vector_base<T, N>>;
+
+template <index_t N>
+struct scalar_type<non_native_vector_base<f8_ocp_t, N>>
 {
-    using type = non_native_vector_base<T, N>;
+    using type = typename non_native_vector_base<f8_ocp_t, N>::data_t;
+
+    static constexpr index_t vector_size = N;
+};
 
-    __host__ __device__ non_native_vector_base()            = default;
-    __host__ __device__ non_native_vector_base(const type&) = default;
-    __host__ __device__ non_native_vector_base(type&&)      = default;
-    __host__ __device__ ~non_native_vector_base()           = default;
+template <index_t N>
+struct scalar_type<non_native_vector_base<bf8_ocp_t, N>>
+{
+    using type = typename non_native_vector_base<bf8_ocp_t, N>::data_t;
 
-    T d[N];
+    static constexpr index_t vector_size = N;
 };
 
 // non-native vector_type implementation
 template <typename T>
 struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using type = d1_t;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using type     = d1_nnv_t;
 
     union alignas(next_pow2(1 * sizeof(T)))
     {
         d1_t d1_;
         StaticallyIndexedArray<d1_t, 1> d1x1_;
+        d1_nnv_t d1_nnv_;
     } data_;
 
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+    __host__ __device__ constexpr vector_type() : data_{d1_t{}} {}
 
     __host__ __device__ constexpr vector_type(type v) : data_{v} {}
 
     template <typename X>
     __host__ __device__ constexpr const auto& AsType() const
     {
-        static_assert(is_same<X, d1_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        return data_.d1x1_;
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
     }
 
     template <typename X>
     __host__ __device__ constexpr auto& AsType()
     {
-        static_assert(is_same<X, d1_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        return data_.d1x1_;
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
     }
 };
 
 template <typename T>
 struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
 
     using type = d2_t;
 
@@ -1081,10 +1238,11 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr const auto& AsType() const
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x2_;
         }
@@ -1101,10 +1259,11 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr auto& AsType()
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x2_;
         }
@@ -1122,9 +1281,10 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
-    using d4_t = non_native_vector_base<T, 4>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
 
     using type = d4_t;
 
@@ -1143,10 +1303,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr const auto& AsType() const
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x4_;
         }
@@ -1167,10 +1328,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr auto& AsType()
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x4_;
         }
@@ -1192,10 +1354,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
-    using d4_t = non_native_vector_base<T, 4>;
-    using d8_t = non_native_vector_base<T, 8>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;
 
     using type = d8_t;
 
@@ -1215,11 +1378,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr const auto& AsType() const
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x8_;
         }
@@ -1244,11 +1408,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr auto& AsType()
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x8_;
         }
@@ -1274,11 +1439,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t  = T;
-    using d2_t  = non_native_vector_base<T, 2>;
-    using d4_t  = non_native_vector_base<T, 4>;
-    using d8_t  = non_native_vector_base<T, 8>;
-    using d16_t = non_native_vector_base<T, 16>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;
+    using d16_t    = non_native_vector_base<T, 16>;
 
     using type = d16_t;
 
@@ -1299,12 +1465,12 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr const auto& AsType() const
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x16_;
         }
@@ -1333,12 +1499,12 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
     template <typename X>
     __host__ __device__ constexpr auto& AsType()
     {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
                       "Something went wrong, please check src and dst types.");
 
-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
         {
             return data_.d1x16_;
         }
@@ -1632,20 +1798,70 @@ using int8x32_t = typename vector_type<int8_t, 32>::type;
 using int8x64_t = typename vector_type<int8_t, 64>::type;
 
 // f8
-using f8x2_t  = typename vector_type<f8_t, 2>::type;
-using f8x4_t  = typename vector_type<f8_t, 4>::type;
-using f8x8_t  = typename vector_type<f8_t, 8>::type;
-using f8x16_t = typename vector_type<f8_t, 16>::type;
-using f8x32_t = typename vector_type<f8_t, 32>::type;
-using f8x64_t = typename vector_type<f8_t, 64>::type;
+using f8x2_fnuz_t  = typename vector_type<f8_fnuz_t, 2>::type;
+using f8x4_fnuz_t  = typename vector_type<f8_fnuz_t, 4>::type;
+using f8x8_fnuz_t  = typename vector_type<f8_fnuz_t, 8>::type;
+using f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type;
+using f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type;
+using f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type;
 
 // bf8
-using bf8x2_t  = typename vector_type<bf8_t, 2>::type;
-using bf8x4_t  = typename vector_type<bf8_t, 4>::type;
-using bf8x8_t  = typename vector_type<bf8_t, 8>::type;
-using bf8x16_t = typename vector_type<bf8_t, 16>::type;
-using bf8x32_t = typename vector_type<bf8_t, 32>::type;
-using bf8x64_t = typename vector_type<bf8_t, 64>::type;
+using bf8x2_fnuz_t  = typename vector_type<bf8_fnuz_t, 2>::type;
+using bf8x4_fnuz_t  = typename vector_type<bf8_fnuz_t, 4>::type;
+using bf8x8_fnuz_t  = typename vector_type<bf8_fnuz_t, 8>::type;
+using bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type;
+using bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type;
+using bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type;
+
+// f8
+using f8x2_ocp_t  = typename vector_type<f8_ocp_t, 2>::type;
+using f8x4_ocp_t  = typename vector_type<f8_ocp_t, 4>::type;
+using f8x8_ocp_t  = typename vector_type<f8_ocp_t, 8>::type;
+using f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type;
+using f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type;
+using f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type;
+
+// bf8
+using bf8x2_ocp_t  = typename vector_type<bf8_ocp_t, 2>::type;
+using bf8x4_ocp_t  = typename vector_type<bf8_ocp_t, 4>::type;
+using bf8x8_ocp_t  = typename vector_type<bf8_ocp_t, 8>::type;
+using bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type;
+using bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type;
+using bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type;
+
+#if CK_FP8_TYPE_OCP
+// f8
+using f8x2_t  = f8x2_ocp_t;
+using f8x4_t  = f8x4_ocp_t;
+using f8x8_t  = f8x8_ocp_t;
+using f8x16_t = f8x16_ocp_t;
+using f8x32_t = f8x32_ocp_t;
+using f8x64_t = f8x64_ocp_t;
+
+// bf8
+using bf8x2_t  = bf8x2_ocp_t;
+using bf8x4_t  = bf8x4_ocp_t;
+using bf8x8_t  = bf8x8_ocp_t;
+using bf8x16_t = bf8x16_ocp_t;
+using bf8x32_t = bf8x32_ocp_t;
+using bf8x64_t = bf8x64_ocp_t;
+#elif CK_FP8_TYPE_FNUZ
+// f8
+using f8x2_t  = f8x2_fnuz_t;
+using f8x4_t  = f8x4_fnuz_t;
+using f8x8_t  = f8x8_fnuz_t;
+using f8x16_t = f8x16_fnuz_t;
+using f8x32_t = f8x32_fnuz_t;
+using f8x64_t = f8x64_fnuz_t;
+
+// bf8
+using bf8x2_t  = bf8x2_fnuz_t;
+using bf8x4_t  = bf8x4_fnuz_t;
+using bf8x8_t  = bf8x8_fnuz_t;
+using bf8x16_t = bf8x16_fnuz_t;
+using bf8x32_t = bf8x32_fnuz_t;
+using bf8x64_t = bf8x64_fnuz_t;
+#endif
 
 // u8
 using uint8x2_t  = typename vector_type<uint8_t, 2>::type;
@@ -1702,7 +1918,7 @@ struct NumericLimits<int4_t>
 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 
 template <>
-struct NumericLimits<f8_t>
+struct NumericLimits<f8_fnuz_t>
 {
     // negative zero nan mode with exp bias = 8
     static constexpr uint8_t binary_min    = 0x08; // 0b00001000
@@ -1715,17 +1931,17 @@ struct NumericLimits<f8_t>
     // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
     // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0
 
-    __host__ __device__ static constexpr f8_t Min() { return f8_t(binary_min); }
+    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
 
-    __host__ __device__ static constexpr f8_t Max() { return f8_t(binary_max); }
+    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
 
-    __host__ __device__ static constexpr f8_t Lowest() { return f8_t(binary_lowest); }
+    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
 
-    __host__ __device__ static constexpr f8_t QuietNaN() { return f8_t(binary_qnan); }
+    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
 };
 
 template <>
-struct NumericLimits<bf8_t>
+struct NumericLimits<bf8_fnuz_t>
 {
     // negative zero nan mode with exp bias = 16
     static constexpr uint8_t binary_min    = 0x04; // 0b00000100
@@ -1738,13 +1954,59 @@ struct NumericLimits<bf8_t>
     // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
     // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=
 
-    __host__ __device__ static constexpr bf8_t Min() { return bf8_t(binary_min); }
+    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
 
-    __host__ __device__ static constexpr bf8_t Max() { return bf8_t(binary_max); }
+    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
 
-    __host__ __device__ static constexpr bf8_t Lowest() { return bf8_t(binary_lowest); }
+    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
 
-    __host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); }
+    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
+};
+
+template <>
+struct NumericLimits<f8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6
+    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
+    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
+    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111
+
+    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_ocp_t Lowest()
+    {
+        return bit_cast<f8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
+    {
+        return bit_cast<f8_ocp_t>(binary_qnan);
+    }
+};
+
+template <>
+struct NumericLimits<bf8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14
+    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
+    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
+    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101
+
+    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Lowest()
+    {
+        return bit_cast<bf8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
+    {
+        return bit_cast<bf8_ocp_t>(binary_qnan);
+    }
 };
 
 template <typename T>
@@ -1787,7 +2049,7 @@ struct NumericUtils<half_t>
 };
 
 template <>
-struct NumericUtils<f8_t>
+struct NumericUtils<f8_fnuz_t>
 {
     static constexpr int exp  = 4;
     static constexpr int mant = 3;
@@ -1796,13 +2058,28 @@ struct NumericUtils<f8_t>
 };
 
 template <>
-struct NumericUtils<bf8_t>
+struct NumericUtils<bf8_fnuz_t>
 {
     static constexpr int exp  = 5;
     static constexpr int mant = 2;
     static constexpr int bias = 16; // negative zero nan mode
     // static constexpr int bias = 15; // ieee mode
 };
+template <>
+struct NumericUtils<f8_ocp_t>
+{
+    static constexpr int exp  = 4;
+    static constexpr int mant = 3;
+    static constexpr int bias = 7;
+};
+
+template <>
+struct NumericUtils<bf8_ocp_t>
+{
+    static constexpr int exp  = 5;
+    static constexpr int mant = 2;
+    static constexpr int bias = 15;
+};
 
 template <>
 struct NumericUtils<bhalf_t>
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index b374c4ad5..a6c3540d8 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x)
     return (xx & 0x7FFF) > 0x7C00;
 };
 
-static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };
 
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 static inline __host__ bool isnan(int4_t x)
@@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x)
     return (xx & 0x7FFF) > 0x7C00;
 };
 
-static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };
 
 static inline __device__ half_t sqrt(half_t x)
 {
diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp
index b7edf2650..4ea52f7eb 100644
--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/ck.hpp"
+
 namespace ck {
 
 // Pseudo random number generator
@@ -23,7 +25,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }
 
 // version for fp16
-template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
     uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
@@ -38,9 +40,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }
 
 // return 0 if data is not fp16 or fp32
-template <typename T,
-          uint32_t seed_t,
-          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+template <
+    typename T,
+    uint32_t seed_t,
+    std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
     std::ignore = id;
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index 87fa9aa38..f372756e6 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -9,7 +9,7 @@
 #include "ck/utility/array.hpp"
 
 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif
@@ -100,6 +100,18 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
     return type_convert<bhalf_t>(x_fp32);
 }
 
+template <>
+inline __host__ __device__ constexpr f8_ocp_t type_convert<f8_ocp_t, int>(int x)
+{
+    return f8_ocp_t{type_convert<f8_ocp_t::data_type>(x)};
+}
+
+template <>
+inline __host__ __device__ constexpr bf8_ocp_t type_convert<bf8_ocp_t, int>(int x)
+{
+    return bf8_ocp_t{type_convert<bf8_ocp_t::data_type>(x)};
+}
+
 // Convert X to Y
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert_sp(X x)
@@ -163,7 +175,7 @@ __host__ __device__ constexpr Y f8_convert_sr(X x);
 
 // convert fp32 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, float>(float x)
 {
     constexpr int seed = 1254739;
     uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
@@ -189,33 +201,35 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
     return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }
 
 // convert fp16 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
     // convert to float and use native converion
-    return f8_convert_sr<f8_t>(type_convert<float>(x));
+    return f8_convert_sr<f8_fnuz_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
     constexpr int seed               = 1254739;
     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }
 
 // convert fp32 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, float>(float x)
 {
     constexpr int seed = 1254739;
     uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
@@ -240,28 +254,32 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }
 
 // convert fp16 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
     // convert to float and use native converion
-    return f8_convert_sr<bf8_t>(type_convert<float>(x));
+    return f8_convert_sr<bf8_fnuz_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
     constexpr int seed               = 1254739;
     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }
 
@@ -271,7 +289,7 @@ __host__ __device__ constexpr Y f8_convert_rne(X x);
 
 // convert fp32 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
     union
@@ -296,32 +314,34 @@ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
     return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }
 
 // convert fp16 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
     // convert to float and use native converion
-    return f8_convert_rne<f8_t>(type_convert<float>(x));
+    return f8_convert_rne<f8_fnuz_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }
 
 // convert fp32 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
     union
@@ -345,44 +365,59 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }
 
 // convert fp16 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
     // convert to float and use native converion
-    return f8_convert_rne<bf8_t>(type_convert<float>(x));
+    return f8_convert_rne<bf8_fnuz_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
+#endif
+}
+
+// convert fp32 to fp8
+template <>
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_fnuz_t>(x);
+#else
+    return f8_convert_rne<f8_fnuz_t>(x);
 #endif
 }
 
 // convert fp32 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_ocp_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }
 
 // convert fp8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
+inline __host__ __device__ float type_convert<float, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
     float fval;
@@ -392,30 +427,44 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
     return fval;
 #else
     constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(x);
 #endif
 }
 
 template <>
-inline __host__ __device__ float2_t type_convert<float2_t, f8x2_t>(f8x2_t x)
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_fnuz_t>(f8x2_fnuz_t x)
 {
 #if defined(__gfx94__)
     const auto i16val = bit_cast<uint16_t>(x);
     return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0);
 #else
     constexpr bool negative_zero_nan = true;
-    const auto f8x2_v                = vector_type<f8_t, 2>(x);
+    const auto f8x2_v                = vector_type<f8_fnuz_t, 2>(x);
     vector_type<float, 2> f32x2_v;
     f32x2_v.template AsType<float>()(Number<0>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<0>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<0>{}]);
     f32x2_v.template AsType<float>()(Number<1>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<1>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<1>{}]);
     return f32x2_v.template AsType<float2_t>()[Number<0>{}];
 #endif
 }
 
+template <>
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH
+    return fp8_impl::cast_to_f32x2_from_f8x2<f8_ocp_t::default_interpret>(
+        x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
+#else
+    return float2_t{fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<0>{}]),
+                    fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<1>{}])};
+#endif
+}
+
 template <>
 inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 {
@@ -428,42 +477,64 @@ inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 
 // convert fp16 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_fnuz_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_fnuz_t>(x);
+#endif
+}
+
+// convert fp16 to fp8
+template <>
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_ocp_t>(x);
+#else
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }
 
 // convert fp8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
+inline __host__ __device__ half_t type_convert<half_t, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
     // use native conversion to float and convert to fp16
     return type_convert<half_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, half_t, negative_zero_nan>(x);
+#endif
+}
+
+// convert fp32 to bf8
+template <>
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_fnuz_t>(x);
+#else
+    return f8_convert_rne<bf8_fnuz_t>(x);
 #endif
 }
 
 // convert fp32 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_ocp_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }
 
 // convert bf8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
+inline __host__ __device__ float type_convert<float, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
     float fval;
@@ -473,31 +544,42 @@ inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
     return fval;
 #else
     constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, float, negative_zero_nan>(x);
+#endif
+}
+
+// convert fp16 to bf8
+template <>
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_fnuz_t>(x);
+#else
+    return f8_convert_rne<bf8_fnuz_t>(x);
 #endif
 }
 
 // convert fp16 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_ocp_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }
 
 // convert bf8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
+inline __host__ __device__ half_t type_convert<half_t, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
     // use native conversion to float and convert to fp16
     return type_convert<half_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, half_t, negative_zero_nan>(x);
 #endif
 }
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index e1edc4fae..1ae11fe9d 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator
             auto f_mk_kn_mn = [&](auto m, auto n) {
                 const int K = arg.a_m_k_.mDesc.GetLengths()[1];
 
-                AccDataType v_acc = 0;
-                ComputeTypeA v_a  = 0;
-                ComputeTypeB v_b  = 0;
+                AccDataType v_acc{0};
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};
 
                 for(int k = 0; k < K; ++k)
                 {
@@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
                         ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                 }
 
-                CDataType v_c = 0;
+                CDataType v_c{0};
 
                 arg.c_element_op_(v_c, v_acc);
 
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 2c0b6c7b7..dd023e6b5 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -62,7 +62,7 @@ function(add_instance_library INSTANCE_NAME)
     endforeach()
     # Do not build mha instances if gfx94 or gfx90a targets are not on the target list
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
+	    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
          message("removing mha instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
@@ -346,7 +346,7 @@ if(CK_DEVICE_CONV_INSTANCES)
 endif()
 if(CK_DEVICE_MHA_INSTANCES)
         set(gpu_list ${INST_TARGETS})
-        if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
+	if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
             add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
             add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
             target_compile_features(device_mha_operations PUBLIC)
diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
index af31cf8a8..e31433cc8 100644
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
@@ -15,7 +15,7 @@ void add_device_pool3d_fwd_ndhwc_f8_instances(
         instances)
 {
     add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
 }
 
 void add_device_pool3d_fwd_ndhwc_index_f8_instances(
@@ -23,7 +23,7 @@ void add_device_pool3d_fwd_ndhwc_index_f8_instances(
         instances)
 {
     add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, true>{});
 }
 
 } // namespace instance
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index 5bee67c1c..be69b67b5 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -150,7 +150,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
         break;
     default:
         a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
         d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
     }
diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
index f3d2c5561..b585b7d56 100644
--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -157,7 +157,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
         break;
     default:
         a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index 15a21206c..700ada73a 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -174,7 +174,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
         break;
     default:
         a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
 
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index f2fcb0b13..e3c462e21 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -140,7 +140,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
         break;
     default:
         a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
         b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }
 
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index 0419ccd8e..1373dbc49 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -74,8 +74,8 @@ int profile_gemm_impl(int do_verification,
     switch(init_method)
     {
     case 0:
-        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
-        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
+        ck::utils::FillConstant<ADataType>{type_convert<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<BDataType>{type_convert<BDataType>(1.f)}(b_k_n);
         break;
     case 1:
         ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt
index a783be7bb..a9d3dad7f 100644
--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -9,13 +9,38 @@ if (USE_BITINT_EXTENSION_INT4)
   endif()
 endif()
 
-add_gtest_executable(test_fp8 test_fp8.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_fp8 PRIVATE utility)
+
+
+add_custom_target(test_fp8)
+
+if (CK_USE_OCP_FP8)
+  add_gtest_executable(test_fp8_ocp test_fp8_ocp.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_fp8_ocp PRIVATE utility)
+  endif()
+
+  add_gtest_executable(test_bf8_ocp test_bf8_ocp.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_bf8_ocp PRIVATE utility)
+  endif()
+
+  add_dependencies(test_fp8 test_fp8_ocp)
+  add_dependencies(test_fp8 test_bf8_ocp)
 endif()
-add_gtest_executable(test_bf8 test_bf8.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_bf8 PRIVATE utility)
+
+if (CK_USE_FNUZ_FP8)
+  add_gtest_executable(test_fp8_fnuz test_fp8_fnuz.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_fp8_fnuz PRIVATE utility)
+  endif()
+
+  add_gtest_executable(test_bf8_fnuz test_bf8_fnuz.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_bf8_fnuz PRIVATE utility)
+  endif()
+
+  add_dependencies(test_fp8 test_fp8_fnuz)
+  add_dependencies(test_fp8 test_bf8_fnuz)
 endif()
 
 add_gtest_executable(test_custom_type test_custom_type.cpp)
diff --git a/test/data_type/test_bf8.cpp b/test/data_type/test_bf8_fnuz.cpp
similarity index 52%
rename from test/data_type/test_bf8.cpp
rename to test/data_type/test_bf8_fnuz.cpp
index 6f50db68c..4ff796a61 100644
--- a/test/data_type/test_bf8.cpp
+++ b/test/data_type/test_bf8_fnuz.cpp
@@ -5,158 +5,169 @@
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 
-using ck::bf8_t;
+using ck::bf8_fnuz_t;
 using ck::f8_convert_rne;
 using ck::f8_convert_sr;
 using ck::half_t;
 using ck::type_convert;
 
-TEST(BF8, NumericLimits)
+TEST(BF8FNUZ, NumericLimits)
 {
     // constants given for negative zero nan mode
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Min(), type_convert<bf8_t>(0x04));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Max(), type_convert<bf8_t>(0x7F));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Lowest(), type_convert<bf8_t>(0xFF));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::QuietNaN(), type_convert<bf8_t>(0x80));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Min(), type_convert<bf8_fnuz_t>(0x04));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Max(), type_convert<bf8_fnuz_t>(0x7F));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Lowest(), type_convert<bf8_fnuz_t>(0xFF));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(), type_convert<bf8_fnuz_t>(0x80));
 }
 
-TEST(BF8, ConvertFP32Nearest)
+TEST(BF8FNUZ, ConvertFP32Nearest)
 {
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to bf8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(0.0f)), abs_tol);
     // don't run the next test on gfx11 devices
 #ifndef CK_SKIP_FLAKY_F8_TEST
     // convert minimal float to bf8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::min())),
                 abs_tol);
 #endif
-    // convert maximal bf8_t to float and check if equal to 57344.0
-    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_rne<bf8_t>(57344.0f)), abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // convert maximal bf8_fnuz_t to float and check if equal to 57344.0
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(max_bf8_t_float)), abs_tol);
     // convert maximal float to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(57344.0f,
-                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::max())),
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
-    // convert inf float to bf8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_rne<bf8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to bf8_fnuz_t and check if it is qNan
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to bf8 and back, check if holds
     float pos_float = 0.0000762939f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(pos_float)), abs_tol);
     // negative norm float value to bf8 and back, check if holds
     float neg_float = -0.0000610351f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(neg_float)), abs_tol);
     // positive subnorm float value to bf8 and back, check if holds
     pos_float = 0.0000305175f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(pos_float)), abs_tol);
     // negative subnorm float value to bf8 and back, check if holds
     neg_float = -0.0000152587f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(neg_float)), abs_tol);
 }
 
-TEST(BF8, ConvertFP32Stochastic)
+TEST(BF8FNUZ, ConvertFP32Stochastic)
 {
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to bf8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(0.0f)), abs_tol);
     // convert minimal float to bf8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::min())),
                 abs_tol);
-    // convert maximal bf8_t to float and check if equal to 57344.0
-    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_sr<bf8_t>(57344.0f)), abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // convert maximal bf8_fnuz_t to float and check if equal to 57344.0
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(max_bf8_t_float)), abs_tol);
     // convert maximal float to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(57344.0f,
-                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::max())),
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
-    // convert inf float to bf8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_sr<bf8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to bf8_fnuz_t and check if it is qNan
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to bf8 and back, check if holds
     float pos_float = 0.0000762939f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(pos_float)), abs_tol);
     // negative norm float value to bf8 and back, check if holds
     float neg_float = -0.0000610351f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(neg_float)), abs_tol);
     // positive subnorm float value to bf8 and back, check if holds
     pos_float = 0.0000305175f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(pos_float)), abs_tol);
     // negative subnorm float value to bf8 and back, check if holds
     neg_float = -0.0000152587f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(neg_float)), abs_tol);
 }
 
-TEST(BF8, ConvertFP16Nearest)
+TEST(BF8FNUZ, ConvertFP16Nearest)
 {
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to bf8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(
+        half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to bf8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
-    // convert maximal bf8_t to fp16 and check if equal to 57344.0
+
+    const auto max_bf8_t_half = type_convert<half_t>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0
     ASSERT_NEAR(
-        half_t{57344.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{57344.0})), abs_tol);
+        max_bf8_t_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(max_bf8_t_half)), abs_tol);
     // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(half_t{57344.0},
-                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Max())),
+    ASSERT_NEAR(max_bf8_t_half,
+                type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
-    // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to bf8 and back, check if holds
     half_t pos_half = half_t{0.0000762939};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(pos_half)), abs_tol);
     // negative norm fp16 value to bf8 and back, check if holds
     half_t neg_half = half_t{-0.0000610351};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to bf8 and back, check if holds
     pos_half = half_t{0.0000305175};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to bf8 and back, check if holds
     neg_half = half_t{-0.0000152587};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(neg_half)), abs_tol);
 }
 
-TEST(BF8, ConvertFP16Stochastic)
+TEST(BF8FNUZ, ConvertFP16Stochastic)
 {
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to bf8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to bf8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
-    // convert maximal bf8_t to fp16 and check if equal to 57344.0
+
+    const auto max_bf8_t_half = type_convert<half_t>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // convert maximal bf8_fnuz_t to fp16 and check if equal to 57344.0
     ASSERT_NEAR(
-        half_t{57344.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{57344.0})), abs_tol);
+        max_bf8_t_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(max_bf8_t_half)), abs_tol);
     // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(half_t{57344.0},
-                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Max())),
+    ASSERT_NEAR(max_bf8_t_half,
+                type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
-    // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to bf8 and back, check if holds
     half_t pos_half = half_t{0.0000762939};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(pos_half)), abs_tol);
     // negative norm fp16 value to bf8 and back, check if holds
     half_t neg_half = half_t{-0.0000610351};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to bf8 and back, check if holds
     pos_half = half_t{0.0000305175};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to bf8 and back, check if holds
     neg_half = half_t{-0.0000152587};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(neg_half)), abs_tol);
 }
diff --git a/test/data_type/test_bf8_ocp.cpp b/test/data_type/test_bf8_ocp.cpp
new file mode 100644
index 000000000..9d4ee38b1
--- /dev/null
+++ b/test/data_type/test_bf8_ocp.cpp
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::bf8_ocp_t;
+using ck::f8_convert_rne;
+using ck::f8_convert_sr;
+using ck::half_t;
+using ck::type_convert;
+
+TEST(BF8OCP, NumericLimits)
+{ // constants given for OCP FP8
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Min(),
+              type_convert<bf8_ocp_t>(0x04)); // 0b00000100 = 2^-14
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              type_convert<bf8_ocp_t>(0x7B)); // 0b01111011 = 57344
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Lowest(),
+              type_convert<bf8_ocp_t>(0xFB)); // 0b11111011 = -57344
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::QuietNaN().data,
+              type_convert<bf8_ocp_t>(0x7D).data); // 0b01111101
+    EXPECT_FALSE(ck::NumericLimits<bf8_ocp_t>::QuietNaN() ==
+                 ck::NumericLimits<bf8_ocp_t>::QuietNaN());
+    EXPECT_TRUE(ck::fp8_is_inf(type_convert<bf8_ocp_t>(0xFC)) &&
+                ck::fp8_is_inf(type_convert<bf8_ocp_t>(0x7C)));
+}
+
+TEST(BF8OCP, ConvertFP32Nearest)
+{
+    // fix the tolerance value
+    float abs_tol = 1e-6;
+
+    // convert 0 float to bfp8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to bf8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to float and check if equal to bf8 max
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_rne<bf8_ocp_t>(max_bf8_t_float)), 0.0f);
+
+    // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive normal float value to bf8 and back, check if holds
+    float pos_float = 0.0000762939f; // 10*2^-17
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_ocp_t>(pos_float)), abs_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14
+    ASSERT_NEAR(neg_min_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(neg_min_bf8)), 0.0f);
+
+    // positive subnorm float value to bf8 and back, check if holds
+    constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15
+    ASSERT_NEAR(
+        pos_subnorm_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(pos_subnorm_bf8)), 0.0f);
+
+    // min subnorm bf8 value to bf8 and back, check if holds
+    constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16
+    ASSERT_NEAR(
+        min_subnorm_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(min_subnorm_bf8)), 0.0f);
+
+    // smaller than min subnorm bf8 value to bf8 must be zero
+    constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17
+    ASSERT_EQ(0.0f, type_convert<float>(f8_convert_rne<bf8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN
+    const auto bf8_nan = f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP32Stochastic)
+{
+    // fix the tolerance value
+    float abs_tol = 1e-6;
+
+    // convert 0 float to bfp8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to bf8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to float and check if equal to bf8 max
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_sr<bf8_ocp_t>(max_bf8_t_float)), 0.0f);
+
+    // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive normal float value to bf8 and back, check if holds
+    float pos_float = 0.0000762939f; // 10*2^-17
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_ocp_t>(pos_float)), abs_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14
+    ASSERT_NEAR(neg_min_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(neg_min_bf8)), 0.0f);
+
+    // positive subnorm float value to bf8 and back, check if holds
+    constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15
+    ASSERT_NEAR(
+        pos_subnorm_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(pos_subnorm_bf8)), 0.0f);
+
+    // min subnorm bf8 value to bf8 and back, check if holds
+    constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16
+    ASSERT_NEAR(
+        min_subnorm_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(min_subnorm_bf8)), 0.0f);
+
+    // smaller than min subnorm bf8 value to bf8  alternates between 0 and 2^-16
+    constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17
+    ASSERT_NEAR(0.0f,
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(less_than_min_subnorm)),
+                0.0000152587890625f);
+
+    // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN
+    const auto bf8_nan = f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP16Nearest)
+{
+    // fix the tolerance value
+    constexpr half_t half_t_tol  = 1e-3;
+    constexpr half_t half_t_zero = 0.0;
+
+    // convert 0 half_t to bfp8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t to bf8 and back, check if holds
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                half_t_tol);
+
+    const auto max_bf8_t_half_t = type_convert<half_t>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(max_bf8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<bf8_ocp_t>::Max(),
+        f8_convert_rne<bf8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive normal bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17
+    ASSERT_NEAR(
+        pos_norm_bf8, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(pos_norm_bf8)), half_t_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14
+    ASSERT_NEAR(
+        neg_min_bf8, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(neg_min_bf8)), half_t_zero);
+
+    // positive subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15
+    ASSERT_NEAR(pos_subnorm_bf8,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(pos_subnorm_bf8)),
+                half_t_zero);
+
+    // min subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t min_subnorm_bf8{-0.0000152587890625f}; //-2^-16
+    ASSERT_NEAR(min_subnorm_bf8,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(min_subnorm_bf8)),
+                half_t_zero);
+
+    // smaller than min subnorm bf8 value to bf8 must be zero
+    constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17
+    ASSERT_EQ(half_t_zero, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN
+    const auto bf8_nan = f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP16Stochastic)
+{
+    // fix the tolerance value
+    constexpr half_t half_t_tol    = 1e-3;
+    constexpr half_t half_t_zero   = 0.0;
+    constexpr auto min_subnorm_bf8 = 0.0000152587890625f; // 2^-16
+
+    // convert 0 half_t to bfp8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t (6.103515625e-05) to fp8 and back
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                half_t_zero);
+
+    const auto max_bf8_t_half_t = type_convert<half_t>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(max_bf8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<bf8_ocp_t>::Max(),
+        f8_convert_sr<bf8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive normal bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17
+    ASSERT_NEAR(
+        pos_norm_bf8, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(pos_norm_bf8)), half_t_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14
+    ASSERT_NEAR(
+        neg_min_bf8, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(neg_min_bf8)), half_t_zero);
+
+    // positive subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15
+    ASSERT_NEAR(pos_subnorm_bf8,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(pos_subnorm_bf8)),
+                half_t_zero);
+
+    // min subnorm bf8 value to bf8 and back, check if holds
+    ASSERT_NEAR(half_t{-min_subnorm_bf8},
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(half_t{-min_subnorm_bf8})),
+                half_t_zero);
+
+    // smaller than min subnorm bf8 value to bf8  alternates between 0 and 2^-16
+    constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17
+    ASSERT_NEAR(half_t_zero,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(less_than_min_subnorm)),
+                half_t{min_subnorm_bf8});
+
+    // convert quiet NaN to bf8_ocp_t and check if it is quiet NaN
+    const auto bf8_nan = f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp
index 101681254..a8fa9ba4a 100644
--- a/test/data_type/test_custom_type.cpp
+++ b/test/data_type/test_custom_type.cpp
@@ -872,3 +872,161 @@ TEST(Complex_half, TestAsTypeReshape)
                   test_vec.at(num_elem * i + 1));
     });
 }
+
+#if CK_USE_OCP_FP8
+
+TEST(FP8OCP, TestSize)
+{
+    static_assert(std::is_same_v<f8_t, ck::f8_ocp_t>, "OCP FP8 is not enabled");
+    ASSERT_EQ(sizeof(f8_t), sizeof(ck::fp8_storage_t));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 2>), sizeof(vector_type<ck::fp8_storage_t, 2>));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 4>), sizeof(vector_type<ck::fp8_storage_t, 4>));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 8>), sizeof(vector_type<ck::fp8_storage_t, 8>));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 16>), sizeof(vector_type<ck::fp8_storage_t, 16>));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 32>), sizeof(vector_type<ck::fp8_storage_t, 32>));
+    ASSERT_EQ(sizeof(vector_type<f8_t, 64>), sizeof(vector_type<ck::fp8_storage_t, 64>));
+}
+
+TEST(FP8OCP, TestAsType)
+{
+    static_assert(std::is_same_v<f8_t, ck::f8_ocp_t>, "OCP FP8 is not enabled");
+
+    // test size
+    std::array<float, 8> test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16};
+    constexpr int size            = test_vec.size();
+
+    // reference vector
+    vector_type<f8_t, size> right_vec;
+
+    // check default CTOR
+    ck::static_for<0, size, 1>{}(
+        [&](auto i) { ASSERT_EQ(right_vec.template AsType<f8_t>()(Number<i>{}), f8_t{0}); });
+
+    // assign test values to the vector
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        right_vec.template AsType<f8_t>()(Number<i>{}) = ck::type_convert<f8_t>(test_vec.at(i));
+    });
+
+    // copy the vector
+    vector_type<f8_t, size> left_vec{right_vec};
+
+    // check if values were copied correctly
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        ASSERT_EQ(left_vec.template AsType<f8_t>()(Number<i>{}),
+                  ck::type_convert<f8_t>(test_vec.at(i)));
+    });
+
+    ck::non_native_vector_base<ck::f8_ocp_t, 2> nnvb_f8x2(ck::type_convert<f8_t>(-10.0f));
+    ASSERT_EQ(nnvb_f8x2.template AsType<f8_t>()(Number<0>{}), ck::type_convert<f8_t>(-10.0f));
+    ASSERT_EQ(nnvb_f8x2.template AsType<f8_t>()(Number<1>{}), ck::type_convert<f8_t>(-10.0f));
+}
+
+TEST(FP8OCP, TestAsTypeReshape)
+{
+    static_assert(std::is_same_v<f8_t, ck::f8_ocp_t>, "OCP FP8 is not enabled");
+
+    // test size
+    std::array<float, 8> test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16};
+    constexpr int size            = test_vec.size();
+
+    // reference vector
+    vector_type<f8_t, size> right_vec;
+
+    // check default CTOR
+    ck::static_for<0, size, 1>{}(
+        [&](auto i) { ASSERT_EQ(right_vec.template AsType<f8_t>()(Number<i>{}), f8_t{0}); });
+
+    // assign test values to the vector
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        right_vec.template AsType<f8_t>()(Number<i>{}) = ck::type_convert<f8_t>(test_vec.at(i));
+    });
+
+    // copy the first half of a vector
+    vector_type<f8_t, size / 2> left_vec{
+        right_vec.template AsType<vector_type<f8_t, size / 2>::type>()(Number<0>{})};
+
+    // check if values were copied correctly
+    ck::static_for<0, size / 2, 1>{}([&](auto i) {
+        ASSERT_EQ(left_vec.template AsType<f8_t>()(Number<i>{}),
+                  ck::type_convert<f8_t>(test_vec.at(i)));
+    });
+}
+
+TEST(BF8OCP, TestSize)
+{
+    static_assert(std::is_same_v<bf8_t, ck::bf8_ocp_t>, "OCP BF8 is not enabled");
+    ASSERT_EQ(sizeof(bf8_t), sizeof(ck::fp8_storage_t));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 2>), sizeof(vector_type<ck::fp8_storage_t, 2>));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 4>), sizeof(vector_type<ck::fp8_storage_t, 4>));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 8>), sizeof(vector_type<ck::fp8_storage_t, 8>));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 16>), sizeof(vector_type<ck::fp8_storage_t, 16>));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 32>), sizeof(vector_type<ck::fp8_storage_t, 32>));
+    ASSERT_EQ(sizeof(vector_type<bf8_t, 64>), sizeof(vector_type<ck::fp8_storage_t, 64>));
+}
+
+TEST(BF8OCP, TestAsType)
+{
+    static_assert(std::is_same_v<bf8_t, ck::bf8_ocp_t>, "OCP BF8 is not enabled");
+
+    // test size
+    std::array<float, 8> test_vec = {-4, -2, -0.5, -0.25, 1.0 / 8.0, 1, 1.5, 16};
+    constexpr int size            = test_vec.size();
+
+    // reference vector
+    vector_type<bf8_t, size> right_vec;
+
+    // check default CTOR
+    ck::static_for<0, size, 1>{}(
+        [&](auto i) { ASSERT_EQ(right_vec.template AsType<bf8_t>()(Number<i>{}), bf8_t{0}); });
+
+    // assign test values to the vector
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        right_vec.template AsType<bf8_t>()(Number<i>{}) = ck::type_convert<bf8_t>(test_vec.at(i));
+    });
+
+    // copy the vector
+    vector_type<bf8_t, size> left_vec{right_vec};
+
+    // check if values were copied correctly
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        ASSERT_EQ(left_vec.template AsType<bf8_t>()(Number<i>{}),
+                  ck::type_convert<bf8_t>(test_vec.at(i)));
+    });
+
+    ck::non_native_vector_base<bf8_t, 2> nnvb_bf8x2(ck::type_convert<bf8_t>(-10.0f));
+    ASSERT_EQ(nnvb_bf8x2.template AsType<bf8_t>()(Number<0>{}), ck::type_convert<bf8_t>(-10.0f));
+    ASSERT_EQ(nnvb_bf8x2.template AsType<bf8_t>()(Number<1>{}), ck::type_convert<bf8_t>(-10.0f));
+}
+
+TEST(BF8OCP, TestAsTypeReshape)
+{
+    static_assert(std::is_same_v<bf8_t, ck::bf8_ocp_t>, "OCP BF8 is not enabled");
+
+    // test size
+    std::array<float, 8> test_vec = {-8, -0.5, -0.25, 1.0 / 8.0, 1 / 256, 1, 1.5, 16};
+    constexpr int size            = test_vec.size();
+
+    // reference vector
+    vector_type<bf8_t, size> right_vec;
+
+    // check default CTOR
+    ck::static_for<0, size, 1>{}(
+        [&](auto i) { ASSERT_EQ(right_vec.template AsType<bf8_t>()(Number<i>{}), bf8_t{0}); });
+
+    // assign test values to the vector
+    ck::static_for<0, size, 1>{}([&](auto i) {
+        right_vec.template AsType<bf8_t>()(Number<i>{}) = ck::type_convert<bf8_t>(test_vec.at(i));
+    });
+
+    // copy the first half of a vector
+    vector_type<bf8_t, size / 2> left_vec{
+        right_vec.template AsType<vector_type<bf8_t, size / 2>::type>()(Number<0>{})};
+
+    // check if values were copied correctly
+    ck::static_for<0, size / 2, 1>{}([&](auto i) {
+        ASSERT_EQ(left_vec.template AsType<bf8_t>()(Number<i>{}),
+                  ck::type_convert<bf8_t>(test_vec.at(i)));
+    });
+}
+
+#endif
diff --git a/test/data_type/test_fp8.cpp b/test/data_type/test_fp8_fnuz.cpp
similarity index 52%
rename from test/data_type/test_fp8.cpp
rename to test/data_type/test_fp8_fnuz.cpp
index 25d9d9d2f..c2ec6dad9 100644
--- a/test/data_type/test_fp8.cpp
+++ b/test/data_type/test_fp8_fnuz.cpp
@@ -7,154 +7,171 @@
 
 using ck::f8_convert_rne;
 using ck::f8_convert_sr;
-using ck::f8_t;
+using ck::f8_fnuz_t;
 using ck::half_t;
 using ck::type_convert;
 
-TEST(FP8, NumericLimits)
+TEST(FP8FNUZ, NumericLimits)
 {
     // constants given for negative zero nan mode
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Min(), type_convert<f8_t>(0x08));
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Max(), type_convert<f8_t>(0x7F));
-    EXPECT_EQ(ck::NumericLimits<f8_t>::Lowest(), type_convert<f8_t>(0xFF));
-    EXPECT_EQ(ck::NumericLimits<f8_t>::QuietNaN(), type_convert<f8_t>(0x80));
+    EXPECT_EQ(ck::NumericLimits<f8_fnuz_t>::Min(), type_convert<f8_fnuz_t>(0x08));
+    EXPECT_EQ(ck::NumericLimits<f8_fnuz_t>::Max(), type_convert<f8_fnuz_t>(0x7F));
+    EXPECT_EQ(ck::NumericLimits<f8_fnuz_t>::Lowest(), type_convert<f8_fnuz_t>(0xFF));
+    EXPECT_EQ(ck::NumericLimits<f8_fnuz_t>::QuietNaN(), type_convert<f8_fnuz_t>(0x80));
 }
 
-TEST(FP8, ConvertFP32Nearest)
+TEST(FP8FNUZ, ConvertFP32Nearest)
 {
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to fp8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<f8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<f8_fnuz_t>(0.0f)), abs_tol);
     // don't run the next test on gfx11 devices
 #ifndef CK_SKIP_FLAKY_F8_TEST
     // convert minimal float to fp8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::min())),
                 abs_tol);
 #endif
-    // convert maximal f8_t to float and check if equal to 240.0
-    ASSERT_NEAR(240.0f, type_convert<float>(f8_convert_rne<f8_t>(240.0f)), abs_tol);
-    // convert maximal float to fp8 and back, check if clipped to 240.0
-    ASSERT_NEAR(240.0f,
-                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::max())),
+
+    const auto max_f8_t_float = type_convert<float>(ck::NumericLimits<f8_fnuz_t>::Max());
+    // convert maximal f8_fnuz_t to float and check if equal to fp8 max
+    ASSERT_NEAR(
+        max_f8_t_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(max_f8_t_float)), abs_tol);
+
+    // XXX: FNUZ f8_convert_rne behavior is inconsistent.
+    // Clipping large values to fp8 max (saturation to finite) contradicts converting inf float to
+    // fp8 qNAN (no saturation).
+
+    // convert maximal float to fp8 and back, check if clipped to fp8 max
+    ASSERT_NEAR(max_f8_t_float,
+                type_convert<float>(f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
-    // convert inf float to f8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<f8_t>(0x80),
-                f8_convert_rne<f8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to f8_fnuz_t and check if it is qNan
+    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<f8_fnuz_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to fp8 and back, check if holds
     float pos_float = 0.017578125f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(pos_float)), abs_tol);
     // negative norm float value to fp8 and back, check if holds
     float neg_float = -0.015625f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(neg_float)), abs_tol);
     // positive subnorm float value to fp8 and back, check if holds
     pos_float = 0.00390625f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(pos_float)), abs_tol);
     // negative subnorm float value to fp8 and back, check if holds
     neg_float = -0.001953125f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_fnuz_t>(neg_float)), abs_tol);
 }
 
-TEST(FP8, ConvertFP32Stochastic)
+TEST(FP8FNUZ, ConvertFP32Stochastic)
 {
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to fp8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<f8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<f8_fnuz_t>(0.0f)), abs_tol);
     // convert minimal float to fp8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_sr<f8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::min())),
                 abs_tol);
-    // convert maximal f8_t to float and check if equal to 240.0
-    ASSERT_NEAR(240.0f, type_convert<float>(f8_convert_sr<f8_t>(240.0f)), abs_tol);
-    // convert maximal float to fp8 and back, check if clipped to 240.0
-    ASSERT_NEAR(240.0f,
-                type_convert<float>(f8_convert_sr<f8_t>(std::numeric_limits<float>::max())),
+
+    const auto max_f8_t_float = type_convert<float>(ck::NumericLimits<f8_fnuz_t>::Max());
+    // convert maximal f8_fnuz_t to float and check if equal to fp8 max
+    ASSERT_NEAR(
+        max_f8_t_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(max_f8_t_float)), abs_tol);
+    // convert maximal float to fp8 and back, check if clipped to fp8 max
+    ASSERT_NEAR(max_f8_t_float,
+                type_convert<float>(f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::max())),
                 abs_tol);
-    // convert inf float to f8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<f8_t>(0x80),
-                f8_convert_sr<f8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to f8_fnuz_t and check if it is qNan
+    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<f8_fnuz_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to fp8 and back, check if holds
     float pos_float = 0.017578125f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(pos_float)), abs_tol);
     // negative norm float value to fp8 and back, check if holds
     float neg_float = -0.015625f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(neg_float)), abs_tol);
     // positive subnorm float value to fp8 and back, check if holds
     pos_float = 0.00390625f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(pos_float)), abs_tol);
     // negative subnorm float value to fp8 and back, check if holds
     neg_float = -0.001953125f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_fnuz_t>(neg_float)), abs_tol);
 }
 
-TEST(FP8, ConvertFP16Nearest)
+TEST(FP8FNUZ, ConvertFP16Nearest)
 {
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to fp8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to fp8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
-    // convert maximal f8_t to fp16 and check if equal to 240.0
-    ASSERT_NEAR(half_t{240.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{240.0})), abs_tol);
-    // convert maximal fp16 to fp8 and back, check if clipped to 240.0
-    ASSERT_NEAR(half_t{240.0},
-                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Max())),
+
+    const auto max_f8_t_half = type_convert<half_t>(ck::NumericLimits<f8_fnuz_t>::Max());
+    // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max
+    ASSERT_NEAR(
+        max_f8_t_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(max_f8_t_half)), abs_tol);
+    // convert maximal fp16 to fp8 and back, check if clipped to fp8 max
+    ASSERT_NEAR(max_f8_t_half,
+                type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
-    // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<f8_t>(0x80),
-                f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to fp8 and back, check if holds
     half_t pos_half = half_t{0.017578125};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(pos_half)), abs_tol);
     // negative norm fp16 value to fp8 and back, check if holds
     half_t neg_half = half_t{-0.015625};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to fp8 and back, check if holds
     pos_half = half_t{0.00390625};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to fp8 and back, check if holds
     neg_half = half_t{-0.001953125};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_fnuz_t>(neg_half)), abs_tol);
 }
 
-TEST(FP8, ConvertFP16Stochastic)
+TEST(FP8FNUZ, ConvertFP16Stochastic)
 {
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to fp8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<f8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to fp8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
-    // convert maximal f8_t to fp16 and check if equal to 240.0
-    ASSERT_NEAR(half_t{240.0}, type_convert<half_t>(f8_convert_sr<f8_t>(half_t{240.0})), abs_tol);
-    // convert maximal fp16 to fp8 and back, check if clipped to 240.0
-    ASSERT_NEAR(half_t{240.0},
-                type_convert<half_t>(f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::Max())),
+
+    const auto max_f8_t_half = type_convert<half_t>(ck::NumericLimits<f8_fnuz_t>::Max());
+    // convert maximal f8_fnuz_t to fp16 and check if equal to fp8 max
+    ASSERT_NEAR(
+        max_f8_t_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(max_f8_t_half)), abs_tol);
+    // convert maximal fp16 to fp8 and back, check if clipped to fp8 max
+    ASSERT_NEAR(max_f8_t_half,
+                type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
-    // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<f8_t>(0x80),
-                f8_convert_sr<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to f8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<f8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<f8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to fp8 and back, check if holds
     half_t pos_half = half_t{0.017578125};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(pos_half)), abs_tol);
     // negative norm fp16 value to fp8 and back, check if holds
     half_t neg_half = half_t{-0.015625};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to fp8 and back, check if holds
     pos_half = half_t{0.00390625};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to fp8 and back, check if holds
     neg_half = half_t{-0.001953125};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<f8_fnuz_t>(neg_half)), abs_tol);
 }
diff --git a/test/data_type/test_fp8_ocp.cpp b/test/data_type/test_fp8_ocp.cpp
new file mode 100644
index 000000000..a8077f1bd
--- /dev/null
+++ b/test/data_type/test_fp8_ocp.cpp
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::f8_convert_rne;
+using ck::f8_convert_sr;
+using ck::f8_ocp_t;
+using ck::half_t;
+using ck::type_convert;
+
+TEST(FP8OCP, NumericLimits)
+{
+    // constants given for OCP FP8
+    EXPECT_EQ(ck::NumericLimits<f8_ocp_t>::Min(),
+              type_convert<f8_ocp_t>(0x08)); // 0b00001000 = 2^-6
+    EXPECT_EQ(ck::NumericLimits<f8_ocp_t>::Max(), type_convert<f8_ocp_t>(0x7E)); // 0b01111110 = 448
+    EXPECT_EQ(ck::NumericLimits<f8_ocp_t>::Lowest(),
+              type_convert<f8_ocp_t>(0xFE)); // 0b11111110 = -448
+    EXPECT_EQ(ck::NumericLimits<f8_ocp_t>::QuietNaN().data,
+              type_convert<f8_ocp_t>(0x7F).data); // 0b01111111
+    EXPECT_FALSE(ck::NumericLimits<f8_ocp_t>::QuietNaN() ==
+                 ck::NumericLimits<f8_ocp_t>::QuietNaN());
+}
+
+TEST(FP8OCP, ConvertFP32Nearest)
+{
+    // fix the tolerance value
+    float abs_tol = 1e-6;
+    // convert 0 float to fp8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<f8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to fp8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_rne<f8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_f8_t_float = type_convert<float>(ck::NumericLimits<f8_ocp_t>::Max());
+
+    // convert maximal f8_ocp_t to float and check if equal to fp8 max
+    ASSERT_NEAR(
+        max_f8_t_float, type_convert<float>(f8_convert_rne<f8_ocp_t>(max_f8_t_float)), 0.0f);
+
+    // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite)
+    ASSERT_NEAR(max_f8_t_float,
+                type_convert<float>(f8_convert_rne<f8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<f8_ocp_t>::Max(),
+              f8_convert_rne<f8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive norm float value to fp8 and back, check if holds
+    float pos_float = 0.017578125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_ocp_t>(pos_float)), abs_tol);
+
+    // smallest normal fp8 value to fp8 and back, check if holds
+    float neg_float = -0.015625f; //-2^-6
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_ocp_t>(neg_float)), 0.0f);
+
+    // positive subnorm float value to fp8 and back, check if holds
+    pos_float = 0.00390625f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_ocp_t>(pos_float)), abs_tol);
+
+    // min subnorm fp8 value to fp8 and back, check if holds
+    neg_float = -0.001953125f; //-2^-9
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_ocp_t>(neg_float)), 0.0f);
+
+    // smaller than min subnorm fp8 value to fp8 must be zero
+    auto less_than_min_subnorm = 0.0009765625f; // 2^-10
+    ASSERT_EQ(0.0f, type_convert<float>(f8_convert_rne<f8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to f8_ocp_t and check if it is quiet NaN
+    auto f8_nan = f8_convert_rne<f8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f);
+}
+
+TEST(FP8OCP, ConvertFP32Stochastic)
+{
+    // fix the tolerance value
+    float abs_tol = 1e-6;
+    // convert 0 float to fp8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<f8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to fp8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_sr<f8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_f8_t_float = type_convert<float>(ck::NumericLimits<f8_ocp_t>::Max());
+
+    // convert maximal f8_ocp_t to float and check if equal to fp8 max
+    ASSERT_NEAR(max_f8_t_float, type_convert<float>(f8_convert_sr<f8_ocp_t>(max_f8_t_float)), 0.0f);
+
+    // convert maximal float to fp8 and back, check if clipped to fp8 max (saturation to finite)
+    ASSERT_NEAR(max_f8_t_float,
+                type_convert<float>(f8_convert_sr<f8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to f8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<f8_ocp_t>::Max(),
+              f8_convert_sr<f8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive norm float value to fp8 and back, check if holds
+    float pos_float = 0.017578125f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_ocp_t>(pos_float)), abs_tol);
+
+    // smallest normal fp8 value to fp8 and back, check if holds
+    float neg_float = -0.015625f; //-2^-6
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<f8_ocp_t>(neg_float)), 0.0f);
+
+    // positive subnorm float value to fp8 and back, check if holds
+    pos_float = 0.00390625f;
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<f8_ocp_t>(pos_float)), abs_tol);
+
+    // min subnorm fp8 value to fp8 and back, check if holds
+    constexpr auto min_subnorm_fp8 = -0.001953125f; //-2^-9
+    ASSERT_NEAR(
+        min_subnorm_fp8, type_convert<float>(f8_convert_sr<f8_ocp_t>(min_subnorm_fp8)), 0.0f);
+
+    // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9
+    auto less_than_min_subnorm = 0.0009765625f; // 2^-10
+    ASSERT_NEAR(
+        0.0f, type_convert<float>(f8_convert_sr<f8_ocp_t>(less_than_min_subnorm)), 0.001953125f);
+
+    // convert quiet NaN to f8_ocp_t and check if it is quiet NaN
+    auto f8_nan = f8_convert_sr<f8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE((f8_nan.data & 0x7f) == 0x7f);
+}
+
+TEST(FP8OCP, ConvertFP16Nearest)
+{
+    // fix the tolerance value
+    constexpr half_t half_t_tol  = 1e-3;
+    constexpr half_t half_t_zero = 0.0;
+    // convert 0 half_t to fp8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t to fp8 and back, check if holds
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_rne<f8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                half_t_tol);
+    const auto max_f8_t_half_t = type_convert<half_t>(ck::NumericLimits<f8_ocp_t>::Max());
+
+    // convert maximal f8_ocp_t to half_t and check if equal to fp8 max
+    ASSERT_NEAR(max_f8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<f8_ocp_t>(max_f8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite)
+    ASSERT_NEAR(max_f8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<f8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<f8_ocp_t>::Max(),
+        f8_convert_rne<f8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive norm half_t value to fp8 and back, check if holds
+    half_t pos_half_t{0.017578125f};
+    ASSERT_NEAR(pos_half_t, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(pos_half_t)), half_t_tol);
+
+    // smallest normal fp8 value to fp8 and back, check if holds
+    half_t neg_half_t{-0.015625f}; //-2^-6
+    ASSERT_NEAR(
+        neg_half_t, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(neg_half_t)), half_t_zero);
+
+    // positive subnorm half_t value to fp8 and back, check if holds
+    pos_half_t = half_t{0.00390625f};
+    ASSERT_NEAR(pos_half_t, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(pos_half_t)), half_t_tol);
+
+    // min subnorm fp8 value to fp8 and back, check if holds
+    neg_half_t = half_t{-0.001953125f}; //-2^-9
+    ASSERT_NEAR(
+        neg_half_t, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(neg_half_t)), half_t_zero);
+
+    // smaller than min subnorm fp8 value to fp8 must be zero
+    auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10
+    ASSERT_EQ(half_t_zero, type_convert<half_t>(f8_convert_rne<f8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to f8_ocp_t and check if it is quiet NaN
+    auto f8_nan = f8_convert_rne<f8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data));
+}
+
+TEST(FP8OCP, ConvertFP16Stochastic)
+{
+    // fix the tolerance value
+    constexpr half_t half_t_tol    = 1e-3;
+    constexpr half_t half_t_zero   = 0.0;
+    constexpr auto min_subnorm_fp8 = 0.001953125f; // 2^-9
+
+    // convert 0 half_t to fp8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_sr<f8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t (6.103515625e-05) to fp8 and back
+    // alternates between 0 and 2^-9 (0.001953125)
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_sr<f8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(min_subnorm_fp8));
+
+    const auto max_f8_t_half_t = type_convert<half_t>(ck::NumericLimits<f8_ocp_t>::Max());
+
+    // convert maximal f8_ocp_t to half_t and check if equal to fp8 max
+    ASSERT_NEAR(max_f8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<f8_ocp_t>(max_f8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to fp8 and back, check if clipped to fp8 max (saturation to finite)
+    ASSERT_NEAR(max_f8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<f8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to f8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<f8_ocp_t>::Max(),
+        f8_convert_sr<f8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive norm half_t value to fp8 and back, check if holds
+    half_t pos_half_t{0.017578125f};
+    ASSERT_NEAR(pos_half_t, type_convert<half_t>(f8_convert_sr<f8_ocp_t>(pos_half_t)), half_t_tol);
+
+    // smallest normal fp8 value to fp8 and back, check if holds
+    half_t neg_half_t{-0.015625f}; //-2^-6
+    ASSERT_NEAR(neg_half_t, type_convert<half_t>(f8_convert_sr<f8_ocp_t>(neg_half_t)), half_t_zero);
+
+    // positive subnorm half_t value to fp8 and back, check if holds
+    pos_half_t = half_t{0.00390625f};
+    ASSERT_NEAR(pos_half_t, type_convert<half_t>(f8_convert_sr<f8_ocp_t>(pos_half_t)), half_t_tol);
+
+    // min subnorm fp8 value to fp8 and back, check if holds
+    neg_half_t = half_t{-min_subnorm_fp8}; //-2^-9
+    ASSERT_NEAR(neg_half_t, type_convert<half_t>(f8_convert_sr<f8_ocp_t>(neg_half_t)), half_t_zero);
+
+    // smaller than min subnorm fp8 value to fp8 alternates between 0 and 2^-9
+    auto less_than_min_subnorm = half_t{0.0009765625f}; // 2^-10
+    ASSERT_NEAR(
+        type_convert<float>(half_t_zero),
+        type_convert<float>(type_convert<half_t>(f8_convert_sr<f8_ocp_t>(less_than_min_subnorm))),
+        min_subnorm_fp8);
+
+    // convert quiet NaN to f8_ocp_t and check if it is quiet NaN
+    auto f8_nan = f8_convert_sr<f8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data));
+}
diff --git a/test/pool/test_avg_pool2d_fwd.cpp b/test/pool/test_avg_pool2d_fwd.cpp
index 8dbb37b84..b5e733419 100644
--- a/test/pool/test_avg_pool2d_fwd.cpp
+++ b/test/pool/test_avg_pool2d_fwd.cpp
@@ -138,7 +138,7 @@ TYPED_TEST_SUITE(AvgPool2D_BF16, AvgPool2D_BF16_Types);
 TYPED_TEST_SUITE(AvgPool2D_I8, AvgPool2D_I8_Types);
 TYPED_TEST_SUITE(AvgPool2D_F8, AvgPool2D_F8_Types);
 
-TYPED_TEST(AvgPool2D_F32, AvgPool2D_I8_Test) { this->Run(); }
+TYPED_TEST(AvgPool2D_F32, AvgPool2D_F32_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_F16, AvgPool2D_F16_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_BF16, AvgPool2D_BF16_Test) { this->Run(); }
 TYPED_TEST(AvgPool2D_I8, AvgPool2D_I8_Test) { this->Run(); }
diff --git a/test/pool/test_max_pool2d_fwd.cpp b/test/pool/test_max_pool2d_fwd.cpp
index 80ca47407..217924275 100644
--- a/test/pool/test_max_pool2d_fwd.cpp
+++ b/test/pool/test_max_pool2d_fwd.cpp
@@ -143,7 +143,7 @@ TYPED_TEST_SUITE(MaxPool2D_BF16, MaxPool2D_BF16_Types);
 TYPED_TEST_SUITE(MaxPool2D_I8, MaxPool2D_I8_Types);
 TYPED_TEST_SUITE(MaxPool2D_F8, MaxPool2D_F8_Types);
 
-TYPED_TEST(MaxPool2D_F32, MaxPool2D_I8_Test) { this->Run(); }
+TYPED_TEST(MaxPool2D_F32, MaxPool2D_F32_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_F16, MaxPool2D_F16_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_BF16, MaxPool2D_BF16_Test) { this->Run(); }
 TYPED_TEST(MaxPool2D_I8, MaxPool2D_I8_Test) { this->Run(); }
-- 
GitLab


From 5affda819de5624e83d8d90f883c0a87f80b7ee2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 4 Dec 2024 00:46:47 +0100
Subject: [PATCH 098/153] Add basic documentation structure (#1715)

* Add basic documentation structure

* Add terminology placeholder

* Add codegen placeholder

* Create template for each page
---
 CONTRIBUTORS.md                     |  1 +
 README.md                           | 34 ++++++++++++++---------------
 TERMINOLOGY.md                      |  2 ++
 client_example/25_wrapper/README.md | 11 +++-------
 client_example/README.md            |  2 ++
 codegen/README.md                   |  2 ++
 example/README.md                   |  2 ++
 include/ck/README.md                | 19 ++++++++++++++++
 include/ck_tile/README.md           |  3 ++-
 profiler/README.md                  | 12 ++++++++++
 10 files changed, 62 insertions(+), 26 deletions(-)
 create mode 100644 TERMINOLOGY.md
 create mode 100644 codegen/README.md
 create mode 100644 example/README.md
 create mode 100644 include/ck/README.md

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index cdce5a463..8ef5c2b72 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,3 +1,4 @@
+[Back to the main page](./README.md)
 # Composable Kernel Developers and Contributors
 
 This is the list of developers and contributors to Composable Kernel library
diff --git a/README.md b/README.md
index d8eb152ee..c0872aa56 100644
--- a/README.md
+++ b/README.md
@@ -26,23 +26,15 @@ The current CK library is structured into four layers:
 
 ## General information
 
-To build our documentation locally, use the following code:
-
-``` bash
-cd docs
-pip3 install -r sphinx/requirements.txt
-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
-```
-
-You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
-
-```note
-If you use CK, cite us as follows:
-
-* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
-  This paper will be available on arXiv soon.
-* [CITATION.cff](/CITATION.cff)
-```
+* [CK supported operations](include/ck/README.md)
+* [CK Tile supported operations](include/ck_tile/README.md)
+* [CK wrapper](client_example/25_wrapper/README.md)
+* [CK codegen](codegen/README.md)
+* [CK profiler](profiler/README.md)
+* [Examples (Custom use of CK supported operations)](example/README.md)
+* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
+* [Terminology](/TERMINOLOGY.md)
+* [Contributors](/CONTRIBUTORS.md)
 
 CK is released under the **[MIT license](/LICENSE)**.
 
@@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
 
     You can find instructions for running ckProfiler in [profiler](/profiler).
 
+* Build our documentation locally:
+
+    ``` bash
+    cd docs
+    pip3 install -r sphinx/requirements.txt
+    python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+    ```
+
 Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
 However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
 crash. On average, you should expect each thread to use ~2Gb of RAM.
diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md
new file mode 100644
index 000000000..e8833efb8
--- /dev/null
+++ b/TERMINOLOGY.md
@@ -0,0 +1,2 @@
+[Back to the main page](./README.md)
+# Composable Kernel terminology
\ No newline at end of file
diff --git a/client_example/25_wrapper/README.md b/client_example/25_wrapper/README.md
index eba3de017..3db9a9af4 100644
--- a/client_example/25_wrapper/README.md
+++ b/client_example/25_wrapper/README.md
@@ -1,14 +1,9 @@
+[Back to the main page](../../README.md)
 # Composable Kernel wrapper GEMM tutorial
 
-This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
-wrapper. We present the base version of GEMM without most of the available optimizations; however,
-it's worth noting that CK has kernels with different optimizations.
+This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.
 
-To implement these optimizations, you can use the CK wrapper or directly use available instances in
-CK. You can also refer to the
-[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
-that uses CK wrapper based on the
-[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
+To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
 
 The kernel definition should look similar to:
 
diff --git a/client_example/README.md b/client_example/README.md
index 64a7130d5..d9f793434 100644
--- a/client_example/README.md
+++ b/client_example/README.md
@@ -1,3 +1,5 @@
+[Back to the main page](../README.md)
+# Composable Kernel client examples
 ##
 Client application links to CK library, and therefore CK library needs to be installed before building client applications.
 
diff --git a/codegen/README.md b/codegen/README.md
new file mode 100644
index 000000000..deadf3221
--- /dev/null
+++ b/codegen/README.md
@@ -0,0 +1,2 @@
+[Back to the main page](../README.md)
+# Composable Kernel codegen
\ No newline at end of file
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 000000000..43b3419f8
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,2 @@
+[Back to the main page](../README.md)
+# Composable Kernel examples
\ No newline at end of file
diff --git a/include/ck/README.md b/include/ck/README.md
new file mode 100644
index 000000000..bff689f6b
--- /dev/null
+++ b/include/ck/README.md
@@ -0,0 +1,19 @@
+[Back to the main page](../../README.md)
+# Composable Kernel supported operations
+## Supported device operations
+* [Average pooling]()
+* [Batched contraction]()
+* [Batched gemm]()
+* [Batchnorm]()
+* [CGEMM]()
+* [Contraction]()
+* [Convolution]()
+* [Image to Column and Column to Image]()
+* [Elementwise]()
+* [GEMM]()
+* [Max pooling]()
+* [Reduce]()
+* [Normalization]()
+* [Permute]()
+* [Put]()
+* [Softmax]()
diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md
index 572e9c7e4..9f88af1ca 100644
--- a/include/ck_tile/README.md
+++ b/include/ck_tile/README.md
@@ -1,4 +1,5 @@
-# ck_tile
+[Back to the main page](../../README.md)
+# Composable Kernel Tile
 ## concept
 `ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. introduces following basic concepts to help users building your own operator
  - tensor coordinate transformation, this is the core concept of layout/index transform abstraction in both compiler time and run time.
diff --git a/profiler/README.md b/profiler/README.md
index 10febcabd..3f4837aad 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -1,3 +1,5 @@
+[Back to the main page](../README.md)
+# Composable Kernel profiler
 ## Profile GEMM kernels
 ```bash
 #arg1: tensor operation (gemm=GEMM)
@@ -180,3 +182,13 @@ Note: Column to image kernel adds to the output memory, this will cause output b
 ################            op datatype  verify  init  log  time  dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
 ./bin/ckProfiler permute_scale        0       1     1    0     1    64   64   64       4096         64          1           1          64        4096
 ```
+
+## Convert MIOpen driver command to CKProfiler
+
+```bash
+python3 ../script/convert_miopen_driver_to_profiler.py
+/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
+-p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1 
+```
+
+Only convolution driver is supported.
-- 
GitLab


From 126ce85aa10347007fb5ca2068bcad378cb17d74 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Wed, 4 Dec 2024 15:59:58 +0800
Subject: [PATCH 099/153] [CK_TILE] Use 'false' for highest dimension padding
 flags (#1716)

* Use 'false' for highest dimension padding flags

* Update padding flag of bias
---
 .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp   | 15 +++++++--------
 .../kernel/fmha_fwd_splitkv_combine_kernel.hpp    |  2 +-
 .../ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp   | 15 +++++++--------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 3de433d6a..3a66b78a5 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
             }
             else
             {
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
             }
         }();
         const auto k_dram = [&]() {
@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 k_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<false, kPadHeadDimQ>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, false>{});
             }
             else
             {
@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_naive,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<false, kPadSeqLenK>{});
             }
         }();
 
@@ -1097,9 +1097,8 @@ struct FmhaFwdKernel
                         number<FmhaPipeline::kAlignmentBias>{},
                         number<1>{});
 
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
                 }();
 
                 return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index ca9da91a5..0bccabdd2 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -339,7 +339,7 @@ struct FmhaFwdSplitKVCombineKernel
                 number<FmhaPipeline::kAlignmentOacc>{},
                 number<1>{});
 
-            auto o_acc_dram_view = pad_tensor_view(
+            const auto o_acc_dram_view = pad_tensor_view(
                 o_acc_dram_naive,
                 make_tuple(number<1>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
                 sequence<false, kPadSeqLenQ, kPadHeadDimV>{});
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index dcb671d81..f37e676da 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -623,14 +623,14 @@ struct FmhaFwdSplitKVKernel
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
             }
             else
             {
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
             }
         }();
 
@@ -645,7 +645,7 @@ struct FmhaFwdSplitKVKernel
             return pad_tensor_view(
                 k_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<false, kPadHeadDimQ>{});
         };
         const auto k_dram = [&]() {
             if constexpr(kIsPagedKV)
@@ -678,7 +678,7 @@ struct FmhaFwdSplitKVKernel
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, false>{});
             }
             else
             {
@@ -692,7 +692,7 @@ struct FmhaFwdSplitKVKernel
                 return pad_tensor_view(
                     v_dram_naive,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<false, kPadSeqLenK>{});
             }
         };
         const auto v_dram = [&]() {
@@ -804,9 +804,8 @@ struct FmhaFwdSplitKVKernel
                         number<FmhaPipeline::kAlignmentBias>{},
                         number<1>{});
 
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
                 }();
 
                 return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-- 
GitLab


From 4cb3d7d7eac162af2c6e1a1d9c3367cb7633347c Mon Sep 17 00:00:00 2001
From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com>
Date: Wed, 4 Dec 2024 21:40:01 +0100
Subject: [PATCH 100/153] Ck tile grouped GEMM example (#1713)

* Ck-tile, impl. grouped gemm

* Workspace is allocated by user, and is passed to the function

* Prepare test to new api design

* Unify GemTransKernelArgs, removing N0 param

* Add 1 to dim3 in paritioner

* Typo: gem - > gemm

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
---
 .../ck_tile/17_grouped_gemm/CMakeLists.txt    |   2 +
 example/ck_tile/17_grouped_gemm/README.md     |  25 ++
 .../ck_tile/17_grouped_gemm/grouped_gemm.cpp  | 151 +++++++++
 .../ck_tile/17_grouped_gemm/grouped_gemm.hpp  |  53 +++
 .../run_grouped_gemm_example.inc              | 191 +++++++++++
 example/ck_tile/17_grouped_gemm/utils.hpp     |  38 +++
 example/ck_tile/CMakeLists.txt                |   1 +
 .../core/utility/amd_address_space.hpp        |  37 +++
 include/ck_tile/ops/gemm.hpp                  |   1 +
 .../ops/gemm/kernel/gemm_tile_partitioner.hpp |  36 ++
 .../ops/gemm/kernel/grouped_gemm_kernel.hpp   | 310 ++++++++++++++++++
 test/ck_tile/CMakeLists.txt                   |   1 +
 test/ck_tile/grouped_gemm/CMakeLists.txt      |   4 +
 .../grouped_gemm/test_grouped_gemm.cpp        |  29 ++
 .../test_grouped_gemm_ut_cases.inc            |  25 ++
 .../grouped_gemm/test_grouped_gemm_util.hpp   | 282 ++++++++++++++++
 16 files changed, 1186 insertions(+)
 create mode 100644 example/ck_tile/17_grouped_gemm/CMakeLists.txt
 create mode 100644 example/ck_tile/17_grouped_gemm/README.md
 create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
 create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
 create mode 100644 example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
 create mode 100644 example/ck_tile/17_grouped_gemm/utils.hpp
 create mode 100644 include/ck_tile/core/utility/amd_address_space.hpp
 create mode 100644 include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
 create mode 100644 test/ck_tile/grouped_gemm/CMakeLists.txt
 create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm.cpp
 create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
 create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp

diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
new file mode 100644
index 000000000..d34013dd6
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
+
diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md
new file mode 100644
index 000000000..d1a0458ed
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/README.md
@@ -0,0 +1,25 @@
+# Grouped CShuffle GEMM
+
+This folder contains example for Grouped GEMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile GEMM, but creates the placeholders for the future support on different GEMM pipeline and different GEMM modules. In the near future, we will gradually migrate all the GEMM features from old CK to CK Tile.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the gemm calculation
+make tile_example_grouped_gemm -j
+```
+This will result in an executable `build/bin/tile_example_grouped_gemm`
+
+## example
+```
+args:
+   -a_layout    Tensor A layout (default:R)
+   -b_layout    Tensor B layout (default:R)
+   -c_layout    Tensor C layout (default:R)
+          -v    0. No validation, 1. Validation on CPU
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+```
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
new file mode 100644
index 000000000..14f3b4a5b
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "grouped_gemm.hpp"
+#include "utils.hpp"
+
+namespace {
+
+struct GroupedGemmKernelParam
+{
+    static const bool kPadM        = false;
+    static const bool kPadN        = false;
+    static const bool kPadK        = false;
+    static const bool kTilePermute = false;
+
+    static const ck_tile::index_t kOutputRank = 2;
+
+    static const int kBlockPerCu         = 1;
+    static const ck_tile::index_t M_Tile = 128;
+    static const ck_tile::index_t N_Tile = 128;
+    static const ck_tile::index_t K_Tile = 32;
+
+    static const ck_tile::index_t M_Warp = 2;
+    static const ck_tile::index_t N_Warp = 2;
+    static const ck_tile::index_t K_Warp = 1;
+
+    static const ck_tile::index_t M_Warp_Tile = 32;
+    static const ck_tile::index_t N_Warp_Tile = 32;
+    static const ck_tile::index_t K_Warp_Tile = 8;
+};
+
+using CodegenGemmShape =
+    ck_tile::TileGemmShape<ck_tile::sequence<GroupedGemmKernelParam::M_Tile,
+                                             GroupedGemmKernelParam::N_Tile,
+                                             GroupedGemmKernelParam::K_Tile>,
+                           ck_tile::sequence<GroupedGemmKernelParam::M_Warp,
+                                             GroupedGemmKernelParam::N_Warp,
+                                             GroupedGemmKernelParam::K_Warp>,
+                           ck_tile::sequence<GroupedGemmKernelParam::M_Warp_Tile,
+                                             GroupedGemmKernelParam::N_Warp_Tile,
+                                             GroupedGemmKernelParam::K_Warp_Tile>>;
+
+using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+template <typename CLayout>
+using GemmEpilogue = std::conditional_t<
+    std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>,
+    ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                                               CDataType,
+                                                               GroupedGemmKernelParam::kPadM,
+                                                               GroupedGemmKernelParam::kPadN,
+                                                               GroupedGemmKernelParam::kTilePermute,
+                                                               GroupedGemmKernelParam::kOutputRank,
+                                                               1,
+                                                               0,
+                                                               TilePartitioner::MPerBlock,
+                                                               TilePartitioner::NPerBlock>>,
+    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<AccDataType,
+                                                                 CDataType,
+                                                                 GroupedGemmKernelParam::kPadM,
+                                                                 GroupedGemmKernelParam::kPadN>>>;
+
+template <typename ALayout, typename BLayout, typename CLayout>
+using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemmKernelParam::kPadM,
+                                                  GroupedGemmKernelParam::kPadN,
+                                                  GroupedGemmKernelParam::kPadK,
+                                                  ALayout,
+                                                  BLayout,
+                                                  CLayout>;
+
+template <typename ALayout, typename BLayout, typename CLayout>
+using CodegenPipelineProblem =
+    ck_tile::GemmPipelineProblem<ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CodegenGemmShape,
+                                 CodegenGemmTraits<ALayout, BLayout, CLayout>>;
+
+using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
+
+template <typename ALayout, typename BLayout, typename CLayout>
+using CodegenGemmPipeline =
+    ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>,
+                                          CodegenGemmPolicy>;
+
+template <typename ALayout, typename BLayout, typename CLayout>
+using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
+                                          CodegenGemmPipeline<ALayout, BLayout, CLayout>,
+                                          GemmEpilogue<CLayout>>;
+}; // namespace
+
+std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs)
+{
+    return ::Kernel<std::nullptr_t, std::nullptr_t, std::nullptr_t>::GetWorkSpaceSize(gemm_descs);
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                   const ck_tile::stream_config& s,
+                   void* p_workspace_)
+{
+    using GroupedGemmKernel = ::Kernel<ALayout, BLayout, CLayout>;
+
+    auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs);
+
+    const dim3 grids      = GroupedGemmKernel::GridSize(gemm_descs);
+    constexpr dim3 blocks = GroupedGemmKernel::BlockSize();
+
+    ck_tile::hip_check_error(hipMemcpyWithStream(
+        p_workspace_,
+        arguments.data(),
+        arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg),
+        hipMemcpyHostToDevice,
+        s.stream_id_));
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Launching kernel with args:"
+                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+    }
+
+    float ave_time =
+        ck_tile::launch_kernel(s,
+                               ck_tile::make_kernel<blocks.x, GroupedGemmKernelParam::kBlockPerCu>(
+                                   GroupedGemmKernel{},
+                                   grids,
+                                   blocks,
+                                   0,
+                                   ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
+                                   gemm_descs.size()));
+    return ave_time;
+}
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
new file mode 100644
index 000000000..94af4711d
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+
+template <typename DataType>
+struct GemmBasicTypeConfig;
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using CDataType   = ck_tile::half_t;
+    using AccDataType = float;
+};
+
+using Types = GemmBasicTypeConfig<ck_tile::half_t>;
+
+// Specific type aliases for easy access
+using ADataType   = Types::ADataType;
+using BDataType   = Types::BDataType;
+using AccDataType = Types::AccDataType;
+using CDataType   = Types::CDataType;
+
+using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "R", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("validate", "1", "0. No validation, 1. Validation on CPU")
+        .insert("warmup", "10", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("group_count", "16", "group count");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs);
+
+float grouped_gemm_calc(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                        const ck_tile::stream_config& s,
+                        void* p_workspace_);
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
new file mode 100644
index 000000000..cd5b1c286
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_gemm(int n_warmup,
+                  int n_repeat,
+                  int group_count,
+                  const std::vector<grouped_gemm_kargs>& args)
+{
+
+    ck_tile::DeviceMem gemm_workspace;
+    gemm_workspace.Realloc(GetWorkspaceSize(args));
+
+    float ave_time = grouped_gemm<ALayout, BLayout, CLayout>(
+        args,
+        ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat},
+        gemm_workspace.GetDeviceBuffer());
+
+    std::string op_name{"Grouped Gemm"};
+
+    std::size_t flop = 0, num_btype = 0;
+    for(int j = 0; j < group_count; ++j)
+    {
+        flop += std::size_t(2) * args[j].M * args[j].N * args[j].K;
+
+        num_btype += sizeof(ADataType) * args[j].M * args[j].K +
+                     sizeof(BDataType) * args[j].K * args[j].N +
+                     sizeof(CDataType) * args[j].M * args[j].N;
+    }
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+              << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_grouped_gemm_example_with_layouts(int argc,
+                                          char* argv[],
+                                          const ALayout a_layout                  = ALayout{},
+                                          const BLayout b_layout                  = BLayout{},
+                                          [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+
+    if(!result)
+    {
+        return -1;
+    };
+
+    const int group_count = arg_parser.get_int("group_count");
+    const int repeat      = arg_parser.get_int("repeat");
+    const int warmup      = arg_parser.get_int("warmup");
+
+    std::vector<ck_tile::index_t> Ms;
+    std::vector<ck_tile::index_t> Ns;
+    std::vector<ck_tile::index_t> Ks;
+    std::vector<ck_tile::index_t> stride_As;
+    std::vector<ck_tile::index_t> stride_Bs;
+    std::vector<ck_tile::index_t> stride_Cs;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(128 + 128 * i);
+        Ks.push_back(128 + 64 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+    std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+    std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
+
+    a_m_k_tensors.reserve(group_count);
+    b_k_n_tensors.reserve(group_count);
+    c_m_n_tensors.reserve(group_count);
+
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+    std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
+
+    a_m_k_dev_buf.reserve(group_count);
+    b_k_n_dev_buf.reserve(group_count);
+    c_m_n_dev_buf.reserve(group_count);
+
+    std::vector<grouped_gemm_kargs> gemm_descs;
+    gemm_descs.reserve(group_count);
+
+    for(int i = 0; i < group_count; ++i)
+    {
+        const ck_tile::index_t M = Ms[i];
+        const ck_tile::index_t N = Ns[i];
+        const ck_tile::index_t K = Ks[i];
+
+        stride_As[i] = f_get_default_stride(M, N, stride_As[i], a_layout);
+        stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], b_layout);
+        stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{});
+
+        a_m_k_tensors.push_back(
+            ck_tile::HostTensor<ADataType>(f_host_tensor_descriptor(M, K, stride_As[i], a_layout)));
+        b_k_n_tensors.push_back(
+            ck_tile::HostTensor<BDataType>(f_host_tensor_descriptor(K, N, stride_Bs[i], b_layout)));
+        c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
+            f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{})));
+
+        std::cout << "gemm[" << i << "]"
+                  << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc
+                  << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl;
+
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_tensors[i]);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_tensors[i]);
+
+        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            a_m_k_tensors[i].get_element_space_size_in_bytes()));
+        b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            b_k_n_tensors[i].get_element_space_size_in_bytes()));
+        c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+            c_m_n_tensors[i].get_element_space_size_in_bytes()));
+
+        a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
+        b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
+        c_m_n_dev_buf[i]->SetZero();
+        c_m_n_tensors[i].SetZero();
+
+        const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
+        const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
+        void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
+
+        gemm_descs.push_back({p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+    }
+
+    invoke_gemm<ALayout, BLayout, CLayout>(warmup, repeat, group_count, gemm_descs);
+
+    for(int i = 0; i < group_count; i++)
+    {
+        c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data());
+    }
+
+    bool pass{true};
+    if(arg_parser.get_int("validate"))
+    {
+        for(int i = 0; i < group_count; ++i)
+        {
+            ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+                f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{}));
+            c_m_n_host_ref.SetZero();
+            ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+                a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
+            pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref);
+        }
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+int run_grouped_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string a_layout = arg_parser.get_str("a_layout");
+    const std::string b_layout = arg_parser.get_str("b_layout");
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
diff --git a/example/ck_tile/17_grouped_gemm/utils.hpp b/example/ck_tile/17_grouped_gemm/utils.hpp
new file mode 100644
index 000000000..bb3cdf9fd
--- /dev/null
+++ b/example/ck_tile/17_grouped_gemm/utils.hpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename TLayout>
+constexpr auto
+f_host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
+{
+    using namespace ck_tile::literals;
+
+    if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+    {
+        return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+    }
+    else
+    {
+        return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+    }
+}
+template <typename TLayout>
+constexpr auto
+f_get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
+{
+    if(stride == 0)
+    {
+        if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            return col;
+        }
+        else
+        {
+            return row;
+        }
+    }
+    else
+        return stride;
+}
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 51ebb5bf0..296eb1ece 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -16,3 +16,4 @@ add_subdirectory(13_moe_sorting)
 add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
+add_subdirectory(17_grouped_gemm)
diff --git a/include/ck_tile/core/utility/amd_address_space.hpp b/include/ck_tile/core/utility/amd_address_space.hpp
new file mode 100644
index 000000000..cb242bf0d
--- /dev/null
+++ b/include/ck_tile/core/utility/amd_address_space.hpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+// Address Space for AMDGCN
+// https://llvm.org/docs/AMDGPUUsage.html#address-space
+
+namespace ck_tile {
+
+#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
+
+template <typename T>
+__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
+{
+    // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
+    // only c-style pointer cast seems be able to be compiled
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+    return (T*)p; // NOLINT(old-style-cast)
+#pragma clang diagnostic pop
+}
+
+template <typename T>
+__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
+{
+    // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
+    // only c-style pointer cast seems be able to be compiled
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
+#pragma clang diagnostic pop
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index b9eb24858..82d35b9c5 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -25,6 +25,7 @@
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index 6387233c0..8ffe681f9 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -35,4 +35,40 @@ struct GemmTilePartitioner
         return make_tuple(iM, iN);
     }
 };
+
+template <typename BlockGemmShape_>
+struct GemmTile1DPartitioner
+{
+    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
+
+    static constexpr index_t MPerBlock = BlockGemmShape::kM;
+    static constexpr index_t NPerBlock = BlockGemmShape::kN;
+    static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N)
+    {
+        index_t GridDimX = (M + MPerBlock - 1) / MPerBlock;
+        index_t GridDimY = (N + NPerBlock - 1) / NPerBlock;
+        return dim3(GridDimX * GridDimY, 1, 1);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N)
+    {
+        return integer_divide_ceil(N, NPerBlock);
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K)
+    {
+        return integer_divide_ceil(K, KPerBlock);
+    }
+
+    CK_TILE_DEVICE auto operator()(index_t blockOffset, index_t NBlockSize)
+    {
+        index_t iM = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) /
+                                                    GetNBlock(NBlockSize) * MPerBlock);
+        index_t iN = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) %
+                                                    GetNBlock(NBlockSize) * NPerBlock);
+        return make_tuple(iM, iN);
+    }
+};
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
new file mode 100644
index 000000000..f24fc47af
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/utility/literals.hpp"
+#include "ck_tile/core/utility/amd_address_space.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/host.hpp"
+
+namespace ck_tile {
+
+struct GroupedGemmHostArgs
+{
+    const void* a_ptr;
+    const void* b_ptr;
+    void* c_ptr;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+};
+
+template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
+struct GroupedGemmKernel
+{
+    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+
+    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    struct GemmTransKernelArg
+    {
+        GroupedGemmHostArgs group_karg;
+        ck_tile::index_t block_start;
+        ck_tile::index_t block_end;
+
+        GemmTransKernelArg() = default;
+        GemmTransKernelArg(GroupedGemmHostArgs&& karg, index_t bl_start, index_t bl_end)
+            : group_karg{karg}, block_start{bl_start}, block_end{bl_end}
+        {
+        }
+    };
+
+    __host__ static size_t GetWorkSpaceSize(const std::vector<GroupedGemmHostArgs>& gemm_descs)
+    {
+        return gemm_descs.size() * sizeof(GemmTransKernelArg);
+    }
+
+    __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+
+    using Hargs = GroupedGemmHostArgs;
+
+    __host__ static constexpr auto GridSize(const std::vector<Hargs>& gemm_descs)
+    {
+        index_t grid_size = 0;
+        for(const auto& it_desc : gemm_descs)
+        {
+            const auto dim3 = TilePartitioner::GridSize(it_desc.M, it_desc.N);
+            grid_size += dim3.x * dim3.y * 1;
+        }
+        return dim3(grid_size, 1, 1);
+    }
+
+    CK_TILE_HOST static auto MakeKargs(const std::vector<Hargs>& gemm_descs)
+    {
+        std::vector<GemmTransKernelArg> gemm_kernel_args_;
+        index_t group_count = ck_tile::type_convert<ck_tile::index_t>(gemm_descs.size());
+        index_t grid_size   = 0;
+        gemm_kernel_args_.reserve(group_count);
+
+        for(std::size_t i = 0; i < gemm_descs.size(); ++i)
+        {
+            const index_t M = gemm_descs[i].M;
+            const index_t N = gemm_descs[i].N;
+            const index_t K = gemm_descs[i].K;
+
+            if(M == 0 || N == 0 || K == 0)
+            {
+                continue;
+            }
+
+            const index_t stride_a = gemm_descs[i].stride_A;
+            const index_t stride_b = gemm_descs[i].stride_B;
+            const index_t stride_c = gemm_descs[i].stride_C;
+
+            const auto dim3             = TilePartitioner::GridSize(M, N);
+            const index_t grid_size_grp = dim3.x * 1 * 1;
+
+            const index_t block_start = grid_size;
+            const index_t block_end   = grid_size + grid_size_grp;
+
+            grid_size += grid_size_grp;
+
+            auto karg = GroupedGemmHostArgs{type_convert<const ADataType*>(gemm_descs[i].a_ptr),
+                                            type_convert<const BDataType*>(gemm_descs[i].b_ptr),
+                                            type_convert<CDataType*>(gemm_descs[i].c_ptr),
+                                            M,
+                                            N,
+                                            K,
+                                            stride_a,
+                                            stride_b,
+                                            stride_c};
+
+            gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end);
+        }
+
+        return gemm_kernel_args_;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    CK_TILE_DEVICE void Run(const Hargs& kargs, const index_t block_start) const
+    {
+        const auto [i_m, i_n] = TilePartitioner{}(block_start, kargs.N);
+        // options
+        const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
+        const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
+        // Convert pointers to tensor views
+        auto a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(kargs.stride_A, 1),
+                    number<GemmPipeline::VectorSizeA>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_start,
+                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(1, kargs.stride_A),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        auto b_tensor_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(1, kargs.stride_B),
+                    number<1>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_start,
+                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(kargs.stride_B, 1),
+                    number<GemmPipeline::VectorSizeB>{},
+                    number<1>{});
+            }
+        }();
+
+        auto a_pad_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        // clang-format on
+
+        auto a_block_window = make_tile_window(
+            a_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::KPerBlock>{}),
+            {i_m, 0});
+
+        auto b_pad_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<GemmPipeline::kPadN, false>{});
+            }
+        }();
+
+        auto b_block_window = make_tile_window(
+            b_pad_view,
+            make_tuple(number<TilePartitioner::NPerBlock>{}, number<TilePartitioner::KPerBlock>{}),
+            {i_n, 0});
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
+
+        // Run GEMM cooperatively by whole wokrgroup.
+        auto c_block_tile =
+            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
+
+        CDataType* c_start = static_cast<CDataType*>(kargs.c_ptr);
+        auto c_tensor_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<GemmPipeline::VectorSizeC>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_start,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        auto c_pad_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<false, GemmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<GemmPipeline::kPadM, false>{});
+            }
+        }();
+        auto CBlockWindow_pad = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {i_m, i_n});
+
+        EpiloguePipeline{}(CBlockWindow_pad, c_block_tile);
+    }
+
+    CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+                                   int group_count) const
+    {
+        const index_t block_id   = ck_tile::get_block_1d_id();
+        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg*>(
+            cast_pointer_to_generic_address_space(gemm_descs_const));
+
+        index_t left     = 0;
+        index_t right    = group_count;
+        index_t group_id = index_t((left + right) / 2);
+
+        while((!(block_id >= gemm_desc_ptr[group_id].block_start &&
+                 block_id < gemm_desc_ptr[group_id].block_end)) &&
+              left <= right)
+        {
+            if(block_id < gemm_desc_ptr[group_id].block_start)
+            {
+                right = group_id;
+            }
+            else
+            {
+                left = group_id;
+            }
+            group_id = index_t((left + right) / 2);
+        }
+
+        Run(gemm_desc_ptr[group_id].group_karg, gemm_desc_ptr[group_id].block_start);
+    }
+};
+
+} // namespace ck_tile
diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt
index fd0de0f9c..77cf35f66 100644
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(image_to_column)
 add_subdirectory(gemm)
 add_subdirectory(batched_gemm)
+add_subdirectory(grouped_gemm)
diff --git a/test/ck_tile/grouped_gemm/CMakeLists.txt b/test/ck_tile/grouped_gemm/CMakeLists.txt
new file mode 100644
index 000000000..f4845847f
--- /dev/null
+++ b/test/ck_tile/grouped_gemm/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Currently ck_tile is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_ck_tile_grouped_gemm test_grouped_gemm.cpp)
+endif()
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp
new file mode 100644
index 000000000..1bce0f8aa
--- /dev/null
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_grouped_gemm_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F32 = float;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
+    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>//,
+    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGroupedGemm, KernelTypes);
+
+#include "test_grouped_gemm_ut_cases.inc"
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
new file mode 100644
index 000000000..68c4693bb
--- /dev/null
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -0,0 +1,25 @@
+#pragma once
+
+TYPED_TEST(TestCkTileGroupedGemm, Basic)
+{
+    const int group_count = 16;
+    std::vector<int> Ms;
+    std::vector<int> Ns;
+    std::vector<int> Ks;
+    std::vector<int> stride_As;
+    std::vector<int> stride_Bs;
+    std::vector<int> stride_Cs;
+
+    for(int i = 0; i < group_count; i++)
+    {
+        Ms.push_back(256 + 256 * i);
+        Ns.push_back(128 + 128 * i);
+        Ks.push_back(128 + 64 * i);
+
+        stride_As.push_back(Ks[i]);
+        stride_Bs.push_back(Ks[i]);
+        stride_Cs.push_back(Ns[i]);
+    }
+
+    this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, group_count);
+}
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
new file mode 100644
index 000000000..f532de21d
--- /dev/null
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <sstream>
+#include <gtest/gtest.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+
+template <typename Tuple>
+class TestCkTileGroupedGemm : public ::testing::Test
+{
+    protected:
+    using ALayout     = std::tuple_element_t<0, Tuple>;
+    using BLayout     = std::tuple_element_t<1, Tuple>;
+    using CLayout     = std::tuple_element_t<2, Tuple>;
+    using ADataType   = std::tuple_element_t<3, Tuple>;
+    using BDataType   = std::tuple_element_t<4, Tuple>;
+    using AccDataType = std::tuple_element_t<5, Tuple>;
+    using CDataType   = std::tuple_element_t<6, Tuple>;
+
+    struct GroupedGemKernelParam
+    {
+        static const bool kPadM        = false;
+        static const bool kPadN        = false;
+        static const bool kPadK        = false;
+        static const bool kTilePermute = false;
+
+        static const ck_tile::index_t kOutputRank = 2;
+
+        static const int kBlockPerCu         = 1;
+        static const ck_tile::index_t M_Tile = 128;
+        static const ck_tile::index_t N_Tile = 128;
+        static const ck_tile::index_t K_Tile = 32;
+
+        static const ck_tile::index_t M_Warp = 2;
+        static const ck_tile::index_t N_Warp = 2;
+        static const ck_tile::index_t K_Warp = 1;
+
+        static const ck_tile::index_t M_Warp_Tile = 32;
+        static const ck_tile::index_t N_Warp_Tile = 32;
+        static const ck_tile::index_t K_Warp_Tile = 8;
+    };
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<GroupedGemKernelParam::M_Tile,
+                                                 GroupedGemKernelParam::N_Tile,
+                                                 GroupedGemKernelParam::K_Tile>,
+                               ck_tile::sequence<GroupedGemKernelParam::M_Warp,
+                                                 GroupedGemKernelParam::N_Warp,
+                                                 GroupedGemKernelParam::K_Warp>,
+                               ck_tile::sequence<GroupedGemKernelParam::M_Warp_Tile,
+                                                 GroupedGemKernelParam::N_Warp_Tile,
+                                                 GroupedGemKernelParam::K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+    template <typename CLayout>
+    using GemmEpilogue =
+        std::conditional_t<std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>,
+                           ck_tile::CShuffleEpilogue<
+                               ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                                                CDataType,
+                                                                GroupedGemKernelParam::kPadM,
+                                                                GroupedGemKernelParam::kPadN,
+                                                                GroupedGemKernelParam::kTilePermute,
+                                                                GroupedGemKernelParam::kOutputRank,
+                                                                1,
+                                                                0,
+                                                                TilePartitioner::MPerBlock,
+                                                                TilePartitioner::NPerBlock>>,
+                           ck_tile::Default2DEpilogue<
+                               ck_tile::Default2DEpilogueProblem<AccDataType,
+                                                                 CDataType,
+                                                                 GroupedGemKernelParam::kPadM,
+                                                                 GroupedGemKernelParam::kPadN>>>;
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
+                                                      GroupedGemKernelParam::kPadN,
+                                                      GroupedGemKernelParam::kPadK,
+                                                      ALayout,
+                                                      BLayout,
+                                                      CLayout>;
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType,
+                                     BDataType,
+                                     AccDataType,
+                                     CodegenGemmShape,
+                                     CodegenGemmTraits<ALayout, BLayout, CLayout>>;
+
+    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    using CodegenGemmPipeline =
+        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>,
+                                              CodegenGemmPolicy>;
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
+                                              CodegenGemmPipeline<ALayout, BLayout, CLayout>,
+                                              GemmEpilogue<CLayout>>;
+
+    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
+    std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs)
+    {
+        return Kernel<std::nullptr_t, std::nullptr_t, std::nullptr_t>::GetWorkSpaceSize(gemm_descs);
+    }
+
+    template <typename ALayout, typename BLayout, typename CLayout>
+    void invoke_grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                             const ck_tile::stream_config& s,
+                             void* p_workspace_)
+    {
+        using GroupedGemmKernel = Kernel<ALayout, BLayout, CLayout>;
+
+        auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs);
+
+        const dim3 grids      = GroupedGemmKernel::GridSize(gemm_descs);
+        constexpr dim3 blocks = GroupedGemmKernel::BlockSize();
+
+        ck_tile::hip_check_error(hipMemcpyWithStream(
+            p_workspace_,
+            arguments.data(),
+            arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg),
+            hipMemcpyHostToDevice,
+            s.stream_id_));
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args:"
+                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+        ck_tile::launch_kernel(s,
+                               ck_tile::make_kernel<blocks.x, GroupedGemKernelParam::kBlockPerCu>(
+                                   GroupedGemmKernel{},
+                                   grids,
+                                   blocks,
+                                   0,
+                                   ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
+                                   gemm_descs.size()));
+    }
+
+    public:
+    void Run(const std::vector<int>& Ms,
+             const std::vector<int>& Ns,
+             const std::vector<int>& Ks,
+             std::vector<int>& stride_As,
+             std::vector<int>& stride_Bs,
+             std::vector<int>& stride_Cs,
+             const int group_count = 16)
+    {
+        using namespace ck_tile::literals;
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v<decltype(layout),
+                                                ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+        std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+        std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
+
+        a_m_k_tensors.reserve(group_count);
+        b_k_n_tensors.reserve(group_count);
+        c_m_n_tensors.reserve(group_count);
+
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+        std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
+
+        a_m_k_dev_buf.reserve(group_count);
+        b_k_n_dev_buf.reserve(group_count);
+        c_m_n_dev_buf.reserve(group_count);
+
+        std::vector<grouped_gemm_kargs> gemm_descs;
+        gemm_descs.reserve(group_count);
+
+        for(int i = 0; i < group_count; ++i)
+        {
+            const ck_tile::index_t M = Ms[i];
+            const ck_tile::index_t N = Ns[i];
+            const ck_tile::index_t K = Ks[i];
+
+            stride_As[i] = f_get_default_stride(M, N, stride_As[i], ALayout{});
+            stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{});
+            stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{});
+
+            a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
+                f_host_tensor_descriptor(M, K, stride_As[i], ALayout{})));
+            b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
+                f_host_tensor_descriptor(K, N, stride_Bs[i], BLayout{})));
+            c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
+                f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{})));
+
+            std::cout << "gemm[" << i << "]"
+                      << " a_m_k: " << a_m_k_tensors[i].mDesc
+                      << " b_k_n: " << b_k_n_tensors[i].mDesc
+                      << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl;
+
+            ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_tensors[i]);
+            ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_tensors[i]);
+
+            a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                a_m_k_tensors[i].get_element_space_size_in_bytes()));
+            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                b_k_n_tensors[i].get_element_space_size_in_bytes()));
+            c_m_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
+                c_m_n_tensors[i].get_element_space_size_in_bytes()));
+
+            a_m_k_dev_buf[i]->ToDevice(a_m_k_tensors[i].data());
+            b_k_n_dev_buf[i]->ToDevice(b_k_n_tensors[i].data());
+            c_m_n_dev_buf[i]->SetZero();
+            c_m_n_tensors[i].SetZero();
+
+            const void* p_a = a_m_k_dev_buf[i]->GetDeviceBuffer();
+            const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
+            void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
+
+            gemm_descs.push_back(
+                {p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+        }
+
+        ck_tile::DeviceMem gemm_workspace;
+        gemm_workspace.Realloc(GetWorkspaceSize(gemm_descs));
+
+        invoke_grouped_gemm<ALayout, BLayout, CLayout>(
+            gemm_descs, ck_tile::stream_config{nullptr, false}, gemm_workspace.GetDeviceBuffer());
+
+        for(int i = 0; i < group_count; i++)
+        {
+            c_m_n_dev_buf[i]->FromDevice(c_m_n_tensors[i].data());
+        }
+
+        bool pass{true};
+        for(int i = 0; i < group_count; ++i)
+        {
+            ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+                f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{}));
+            c_m_n_host_ref.SetZero();
+            ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+                a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
+            pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
-- 
GitLab


From d2d1d177ffe04f0ff25fed0aedcb3ede0e07c51b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Dec 2024 22:05:47 -0800
Subject: [PATCH 101/153] Bump rocm-docs-core from 1.10.0 to 1.11.0 in
 /docs/sphinx (#1720)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.10.0 to 1.11.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.10.0...v1.11.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 9969824d2..d1b3465b9 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.10.0
+rocm-docs-core==1.11.0
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index bb731db2d..26d0aa244 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.10.0
+rocm-docs-core==1.11.0
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From feb9a2bd9b50da9d449e5931e936d527a0db89fe Mon Sep 17 00:00:00 2001
From: jakpiase <jakub.piasecki@amd.com>
Date: Thu, 5 Dec 2024 09:02:13 +0100
Subject: [PATCH 102/153] Add IsSupportedArgument() to gemm_kernel (#1698)

* add IsSupportedArgument to gemm_kernel

* add ut and do some refactoring

* switched to ck_tile's integral_constant
---
 example/ck_tile/03_gemm/gemm_basic.cpp        |  5 ++
 example/ck_tile/03_gemm/universal_gemm.cpp    |  5 ++
 .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp   | 73 +++++++++++++++++++
 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp  | 42 +++++------
 .../gemm/test_gemm_mem_pipeline_ut_cases.inc  | 59 +++------------
 .../gemm/test_gemm_mem_pipeline_util.hpp      | 22 ++++--
 6 files changed, 128 insertions(+), 78 deletions(-)

diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index b7d869344..f5260c306 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -92,6 +92,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
     constexpr dim3 blocks = Kernel::BlockSize();
 
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
+
     if(s.log_level_ > 0)
     {
         std::cout << "Launching kernel with args:"
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index eaafc13b9..6c87ca008 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -119,6 +119,11 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
         const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
         constexpr dim3 blocks = Kernel::BlockSize();
 
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
         if(s.log_level_ > 0)
         {
             std::cout << "Launching kernel with args:"
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 96af6e826..763d8cad9 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -66,6 +66,79 @@ struct GemmKernel
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
+    CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs)
+    {
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false)
+            {
+                return false;
+            }
+            if(kargs.K % GemmPipeline::VectorSizeA != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false)
+            {
+                return false;
+            }
+            if(kargs.M % GemmPipeline::VectorSizeA != 0)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false)
+            {
+                return false;
+            }
+            if(kargs.N % GemmPipeline::VectorSizeB != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false)
+            {
+                return false;
+            }
+            if(kargs.K % GemmPipeline::VectorSizeB != 0)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false)
+            {
+                return false;
+            }
+            if(kargs.N % GemmPipeline::VectorSizeC != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false)
+            {
+                return false;
+            }
+            if(kargs.M % GemmPipeline::VectorSizeC != 0)
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
     CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
     {
         const auto [i_m, i_n] = TilePartitioner{}();
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
index a1c80fee4..aeb383c87 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
@@ -8,35 +8,29 @@
 #include "ck_tile/host.hpp"
 #include "test_gemm_mem_pipeline_util.hpp"
 
-using F16 = ck_tile::half_t;
-using F32 = float;
-
-using Row                       = ck_tile::tensor_layout::gemm::RowMajor;
-using Col                       = ck_tile::tensor_layout::gemm::ColumnMajor;
-static constexpr auto Intrawave = ck_tile::GemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = ck_tile::GemmPipelineScheduler::Interwave;
-
-template <typename Tuple>
-class TestCkTileGemmMemPipelineIntrawave : public TestCkTileGemmMemPipeline<Tuple, Intrawave>
-{
-};
-
-template <typename Tuple>
-class TestCkTileGemmMemPipelineInterwave : public TestCkTileGemmMemPipeline<Tuple, Interwave>
-{
-};
+using F16       = ck_tile::half_t;
+using F32       = float;
+using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
+                                             ck_tile::GemmPipelineScheduler::Intrawave>;
+using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
+                                             ck_tile::GemmPipelineScheduler::Interwave>;
 
 // clang-format off
 using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>
     >;
 // clang-format on
 
-TYPED_TEST_SUITE(TestCkTileGemmMemPipelineIntrawave, KernelTypes);
-TYPED_TEST_SUITE(TestCkTileGemmMemPipelineInterwave, KernelTypes);
+TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
 
 #include "test_gemm_mem_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
index 6b914e797..af94d68f2 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
@@ -3,11 +3,7 @@
 
 #pragma once
 
-//------------------------------------------------------------------------------------------------
-//              INTERWAVE SCHEDULER
-//------------------------------------------------------------------------------------------------
-
-TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM)
+TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 1024;
@@ -17,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, SmallM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM)
+TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 1024;
@@ -27,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, MidLargeM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK)
+TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 1024;
@@ -37,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, PaddK)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular)
+TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 1024;
@@ -47,46 +43,15 @@ TYPED_TEST(TestCkTileGemmMemPipelineInterwave, Regular)
         this->Run(M, N, K);
 }
 
-//------------------------------------------------------------------------------------------------
-//              INTRAWAVE SCHEDULER
-//------------------------------------------------------------------------------------------------
-
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, SmallM)
+TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument)
 {
-    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
-    constexpr int N = 1024;
-    constexpr int K = 320;
-
-    for(int M : Ms)
-        this->Run(M, N, K);
-}
+    constexpr int M = 512;
+    constexpr int N = 1025;
+    constexpr int K = 513;
 
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, MidLargeM)
-{
-    std::vector<int> Ms{127, 255, 312, 799, 1573};
-    constexpr int N = 1024;
-    constexpr int K = 320;
-
-    for(int M : Ms)
-        this->Run(M, N, K);
-}
+    constexpr bool PadM = false;
+    constexpr bool PadN = false;
+    constexpr bool PadK = false;
 
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, PaddK)
-{
-    std::vector<int> Ms{127};
-    constexpr int N = 1024;
-    constexpr int K = 432;
-
-    for(int M : Ms)
-        this->Run(M, N, K);
-}
-
-TYPED_TEST(TestCkTileGemmMemPipelineIntrawave, Regular)
-{
-    std::vector<int> Ms{512};
-    constexpr int N = 1024;
-    constexpr int K = 512;
-
-    for(int M : Ms)
-        this->Run(M, N, K);
+    EXPECT_THROW((this->template Run<PadM, PadN, PadK>(M, N, K)), std::runtime_error);
 }
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
index 15f9f516e..6941a7596 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
@@ -11,7 +11,7 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
-template <typename Tuple, ck_tile::GemmPipelineScheduler Scheduler_>
+template <typename Tuple>
 class TestCkTileGemmMemPipeline : public ::testing::Test
 {
     protected:
@@ -22,7 +22,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
     using BDataType                 = std::tuple_element_t<4, Tuple>;
     using AccDataType               = std::tuple_element_t<5, Tuple>;
     using CDataType                 = std::tuple_element_t<6, Tuple>;
-    static constexpr auto Scheduler = Scheduler_;
+    static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value;
     // TODO: expose tile size through test t-param ?
 
     struct gemm_args
@@ -39,6 +39,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         ck_tile::index_t stride_C;
     };
 
+    template <bool PadM, bool PadN, bool PadK>
     void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s)
     {
         // TODO: This should be parameterized in tests
@@ -54,9 +55,9 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         constexpr ck_tile::index_t N_Warp_Tile = 32;
         constexpr ck_tile::index_t K_Warp_Tile = 8;
 
-        constexpr bool kPadM = true;
-        constexpr bool kPadN = true;
-        constexpr bool kPadK = true;
+        constexpr bool kPadM = PadM;
+        constexpr bool kPadN = PadN;
+        constexpr bool kPadK = PadK;
 
         constexpr int kBlockPerCu = 1;
 
@@ -107,6 +108,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
             const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
             constexpr dim3 blocks = Kernel::BlockSize();
 
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
             if(s.log_level_ > 0)
             {
                 std::cout << "Launching kernel with args:"
@@ -212,6 +218,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
 
     void SetUp() override { k_batches_ = {1}; }
 
+    template <bool PadM = true, bool PadN = true, bool PadK = true>
     void Run(const int M,
              const int N,
              const int K,
@@ -221,10 +228,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
     {
         for(auto kb : k_batches_)
         {
-            RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
+            RunSingle<PadM, PadN, PadK>(M, N, K, StrideA, StrideB, StrideC, kb);
         }
     }
 
+    template <bool PadM, bool PadN, bool PadK>
     void RunSingle(const int M,
                    const int N,
                    const int K,
@@ -301,7 +309,7 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
         args.stride_B = stride_B;
         args.stride_C = stride_C;
 
-        invoke_gemm(args, ck_tile::stream_config{nullptr, false});
+        invoke_gemm<PadM, PadN, PadK>(args, ck_tile::stream_config{nullptr, false});
 
         c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
         bool pass = true;
-- 
GitLab


From 86990558e39a99d3e2dd909e45f5d38c3b13d956 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 5 Dec 2024 17:29:12 -0800
Subject: [PATCH 103/153] Upgrade default compiler to ROCm6.3 (#1723)

* upgrade to rocm6.3 compiler

* Proposed solution to convnd test failures in ROCm 6.3

---------

Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com>
---
 Dockerfile                                    | 13 ++++--------
 Dockerfile.compiler                           |  2 +-
 Jenkinsfile                                   | 21 ++++++++++---------
 .../convscale/convnd_fwd_convscale_common.hpp |  9 ++++----
 4 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f9b7d76e3..6689ae08f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.2
+ARG ROCMVERSION=6.3
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
@@ -13,17 +13,12 @@ RUN set -xe && \
     apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
     curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
-RUN if [ "$ROCMVERSION" != "6.3" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
+RUN if [ "$ROCMVERSION" != "6.4" ]; then \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb  --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
         wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
         sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
         sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2074281; \
     fi
 
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
index 354b71f69..3f3329092 100644
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
diff --git a/Jenkinsfile b/Jenkinsfile
index f8493fa2f..58cd72c8c 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -38,13 +38,14 @@ def getBaseDockerImageName(){
         img = "${params.USE_CUSTOM_DOCKER}"
     }
     else{
-    if (params.ROCMVERSION != "6.3"){
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
-        }
-    else{
-        img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        def ROCM_numeric = "${params.ROCMVERSION}" as float
+        if ( ROCM_numeric < 6.4 ){
+            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            }
+        else{
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            }
         }
-    }
     return img
 }
 
@@ -739,8 +740,8 @@ def process_results(Map conf=[:]){
 }
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
-                                              0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
+                                              0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
                                               0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                               0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                               0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
@@ -765,8 +766,8 @@ pipeline {
             description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
         string(
             name: 'ROCMVERSION', 
-            defaultValue: '6.2', 
-            description: 'Specify which ROCM version to use: 6.2 (default).')
+            defaultValue: '6.3',
+            description: 'Specify which ROCM version to use: 6.3 (default).')
         string(
             name: 'COMPILER_VERSION', 
             defaultValue: '', 
diff --git a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
index 978221f8e..bf560f8a4 100644
--- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
+++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp
@@ -172,12 +172,13 @@ bool run_grouped_conv_fwd(bool do_verification,
     {
     case 0: break;
     case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        // values generated: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 6});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
     }
 
     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-- 
GitLab


From 58e7f37fc892c1e7aeca338f96ec694712e6e412 Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Fri, 6 Dec 2024 12:59:58 +0800
Subject: [PATCH 104/153] Undo padding-flag changes in fmha_fwd_kernel.hpp
 (#1725)

---
 .../ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp   | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 3a66b78a5..3de433d6a 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<false, kPadHeadDimQ>{});
+                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
             else
             {
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<false, kPadHeadDimQ>{});
+                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
         }();
         const auto k_dram = [&]() {
@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 k_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<false, kPadHeadDimQ>{});
+                sequence<kPadSeqLenK, kPadHeadDimQ>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, false>{});
+                    sequence<kPadHeadDimV, kPadSeqLenK>{});
             }
             else
             {
@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_naive,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<false, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, kPadSeqLenK>{});
             }
         }();
 
@@ -1097,8 +1097,9 @@ struct FmhaFwdKernel
                         number<FmhaPipeline::kAlignmentBias>{},
                         number<1>{});
 
-                    return pad_tensor_view(
-                        bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
+                    return pad_tensor_view(bias_dram_naive,
+                                           bias_dram_window_lengths,
+                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
                 }();
 
                 return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-- 
GitLab


From 261f1759de15fd319ba03985ebe7123fae12a722 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Fri, 6 Dec 2024 10:55:23 +0100
Subject: [PATCH 105/153] Support large batch tensors in grouped conv bwd data
 (#1711)

* Support large batch tensors in grouped conv bwd data

* Fix multiD

* fixes

* fixes

* fixes
---
 ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp |  186 +--
 ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp |  294 ++--
 .../transform_conv_bwd_data_to_gemm_v1.hpp    | 1275 ++++++++++-------
 test/grouped_convnd_bwd_data/CMakeLists.txt   |    8 +-
 .../test_grouped_convnd_bwd_data_wmma.cpp     |  108 ++
 ...p => test_grouped_convnd_bwd_data_xdl.cpp} |   39 +-
 6 files changed, 1067 insertions(+), 843 deletions(-)
 create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
 rename test/grouped_convnd_bwd_data/{test_grouped_convnd_bwd_data_xdl_wmma.cpp => test_grouped_convnd_bwd_data_xdl.cpp} (78%)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
index 3fb047f20..359711e5c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
     static constexpr auto I3           = Number<3>{};
     static constexpr index_t KPerBlock = K0PerBlock * K1;
 
-    static constexpr auto transform_conv_to_gemm =
-        TransformConvBwdDataToGemm_v1<NDimSpatial,
-                                      ConvBackwardDataSpecialization,
-                                      K1,
-                                      K1,
-                                      MPerBlock,
-                                      NPerBlock,
-                                      KPerBlock,
-                                      true /* DoPadGemmM */,
-                                      true /* DoPadGemmN */>{};
-
-    static auto GetDummyABDsEGridDescriptor()
-    {
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
-        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};
-
-        const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto ds_grid_desc_m_n = generate_tuple(
-            [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths);
-            },
-            Number<NumDTensor>{});
-
-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths);
+    using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                                     ConvBackwardDataSpecialization,
+                                                                     K1,
+                                                                     K1,
+                                                                     MPerBlock,
+                                                                     NPerBlock,
+                                                                     KPerBlock,
+                                                                     true /* DoPadGemmM */,
+                                                                     true /* DoPadGemmN */,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     ELayout>;
 
+    static auto
+    GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
+        const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
+        const auto ds_grid_desc_m_n =
+            generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); },
+                           Number<NumDTensor>{});
+        const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();
         return make_tuple(
             a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
     }
 
     // desc
-    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+    constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));
 
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                  const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
                  const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
                  const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
-                     ds_g_n_c_wis_lengths,
+                 /*ds_g_n_c_wis_lengths*/,
                  const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                      ds_g_n_c_wis_strides,
                  const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
               a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
-              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
-              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
-              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
-              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
-              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
               conv_filter_strides_{conv_filter_strides},
-              conv_filter_dilations_{conv_filter_dilations},
               input_left_pads_{input_left_pads},
               input_right_pads_{input_right_pads}
         {
@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                             tildes = {i_ztilde, i_ytilde, i_xtilde};
                         }
 
+                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
+                                                                           a_g_n_k_wos_strides,
+                                                                           b_g_k_c_xs_lengths,
+                                                                           b_g_k_c_xs_strides,
+                                                                           e_g_n_c_wis_lengths,
+                                                                           e_g_n_c_wis_strides,
+                                                                           conv_filter_strides,
+                                                                           conv_filter_dilations,
+                                                                           input_left_pads,
+                                                                           input_right_pads,
+                                                                           tildes};
+
                         const auto a_grid_desc_ak0_m_ak1 =
-                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();
 
                         const auto b_grid_desc_bk0_n_bk1 =
-                            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();
 
                         DsGridDesc_M_N ds_grid_desc_m_n;
 
                         // populate Ds desc
                         static_for<0, NumDTensor, 1>{}([&](auto i) {
                             using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                            ds_grid_desc_m_n(i) =
-                                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                                    a_g_n_k_wos_lengths,
-                                    a_g_n_k_wos_strides,
-                                    b_g_k_c_xs_lengths,
-                                    b_g_k_c_xs_strides,
-                                    ds_g_n_c_wis_lengths[i],
-                                    ds_g_n_c_wis_strides[i],
-                                    conv_filter_strides,
-                                    conv_filter_dilations,
-                                    input_left_pads,
-                                    input_right_pads,
-                                    tildes);
-                        });
-
-                        const auto e_grid_desc_m_n =
-                            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
+                            static_assert(is_same_v<DLayout, ELayout>);
+                            ConvToGemmBwdDataTransform conv_to_gemm_transform_d{
                                 a_g_n_k_wos_lengths,
                                 a_g_n_k_wos_strides,
                                 b_g_k_c_xs_lengths,
                                 b_g_k_c_xs_strides,
                                 e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
+                                ds_g_n_c_wis_strides[i],
                                 conv_filter_strides,
                                 conv_filter_dilations,
                                 input_left_pads,
                                 input_right_pads,
-                                tildes);
+                                tildes};
+
+                            ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
+                        });
+
+                        const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();
 
                         // for check validity
                         ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         BElementwiseOp b_element_op_;
         CDEElementwiseOp cde_element_op_;
 
-        // for checking IsSupportedArgument()
         std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
-        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
         std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
-        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
         std::array<index_t, NDimSpatial> conv_filter_strides_;
-        std::array<index_t, NDimSpatial> conv_filter_dilations_;
         std::array<index_t, NDimSpatial> input_left_pads_;
         std::array<index_t, NDimSpatial> input_right_pads_;
     };
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index b544c925e..c8c58d5d8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
           typename ABDataType,
           typename DsPointer,
           typename EDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp,
           typename AGridDesc_AK0_M_AK1,
           typename BGridDesc_BK0_N_BK1,
           typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
           typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
           typename Block2ETileMap,
           typename ComputePtrOffsetOfBatch,
+          typename ComputePtrOffsetOfN,
           bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -73,10 +74,9 @@ __global__ void
             const ABDataType* __restrict__ p_b_grid,
             DsPointer p_ds_grid,
             EDataType* __restrict__ p_e_grid,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CDEElementwiseOperation cde_element_op,
-            const index_t batch_count,
+            const AElementwiseOp a_element_op,
+            const BElementwiseOp b_element_op,
+            const CDEElementwiseOp cde_element_op,
             const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
             const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
             const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -84,24 +84,29 @@ __global__ void
             const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                 e_grid_desc_mblock_mperblock_nblock_nperblock_,
             const Block2ETileMap block_2_ctile_map,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx94__))
     // offset base pointer for each work-group
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+    const long_index_t a_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+    const long_index_t b_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
 
+    const long_index_t a_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+    const long_index_t e_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     DsPointer p_ds_grid_grp;
@@ -112,10 +117,10 @@ __global__ void
     static_for<0, NumDTensor, 1>{}(
         [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
 
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset + a_n_offset,
                                                   p_b_grid + b_batch_offset,
                                                   p_ds_grid_grp,
-                                                  p_e_grid + e_batch_offset,
+                                                  p_e_grid + e_batch_offset + e_n_offset,
                                                   p_shared,
                                                   a_element_op,
                                                   b_element_op,
@@ -130,7 +135,6 @@ __global__ void
     ignore = p_b_grid;
     ignore = p_ds_grid;
     ignore = p_e_grid;
-    ignore = batch_count;
     ignore = a_grid_desc_ak0_m_ak1;
     ignore = b_grid_desc_bk0_n_bk1;
     ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
@@ -139,6 +143,7 @@ __global__ void
     ignore = b_element_op;
     ignore = cde_element_op;
     ignore = compute_ptr_offset_of_batch;
+    ignore = compute_ptr_offset_of_n;
     ignore = block_2_ctile_map;
 #endif
 }
@@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
-    static constexpr auto transform_conv_to_gemm =
-        TransformConvBwdDataToGemm_v1<NDimSpatial,
-                                      ConvBackwardDataSpecialization,
-                                      AK1,
-                                      BK1,
-                                      MPerBlock,
-                                      NPerBlock,
-                                      KPerBlock,
-                                      DoPadGemmM,
-                                      DoPadGemmN>{};
-
-    static auto GetDummyABDsEGridDescriptor()
+    using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                                     ConvBackwardDataSpecialization,
+                                                                     AK1,
+                                                                     BK1,
+                                                                     MPerBlock,
+                                                                     NPerBlock,
+                                                                     KPerBlock,
+                                                                     DoPadGemmM,
+                                                                     DoPadGemmN,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     ELayout,
+                                                                     true, /*SplitConvN*/
+                                                                     ABDataType,
+                                                                     EDataType>;
+
+    static auto
+    GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
     {
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
-        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};
-
-        const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
+        const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
+
+        const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
 
         const auto ds_grid_desc_m_n = generate_tuple(
             [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths);
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                using ConvToGemmBwdDataTransformD =
+                    TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                  ConvBackwardDataSpecialization,
+                                                  AK1,
+                                                  BK1,
+                                                  MPerBlock,
+                                                  NPerBlock,
+                                                  KPerBlock,
+                                                  DoPadGemmM,
+                                                  DoPadGemmN,
+                                                  ALayout,
+                                                  BLayout,
+                                                  DLayout,
+                                                  true, /*SplitConvN*/
+                                                  ABDataType,
+                                                  DDataType>;
+                return ConvToGemmBwdDataTransformD{}.MakeCDescriptor_M_N();
             },
             Number<NumDTensor>{});
 
-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths);
+        const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();
 
         return make_tuple(
             a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
@@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     }
 
     // desc
-    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+    constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));
 
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
@@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
               b_element_op_{b_element_op},
               cde_element_op_{cde_element_op},
               a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
-              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
-              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
-              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
-              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
-              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
               conv_filter_strides_{conv_filter_strides},
-              conv_filter_dilations_{conv_filter_dilations},
               input_left_pads_{input_left_pads},
               input_right_pads_{input_right_pads}
         {
@@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                 p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
             });
 
-            // A/B/Ds/E Batch Stride
-            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
-
             static_for<0, NumDTensor, 1>{}([&](auto i) {
                 compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
             });
@@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                             throw std::runtime_error("wrong! only implemented for 2D and 3D now");
                         }
 
+                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
+                                                                           a_g_n_k_wos_strides,
+                                                                           b_g_k_c_xs_lengths,
+                                                                           b_g_k_c_xs_strides,
+                                                                           e_g_n_c_wis_lengths,
+                                                                           e_g_n_c_wis_strides,
+                                                                           conv_filter_strides,
+                                                                           conv_filter_dilations,
+                                                                           input_left_pads,
+                                                                           input_right_pads,
+                                                                           tildes};
+
+                        conv_N_per_block_ = conv_to_gemm_transform_.N_;
+
                         const auto a_grid_desc_ak0_m_ak1 =
-                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();
 
                         const auto b_grid_desc_bk0_n_bk1 =
-                            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();
 
                         DsGridDesc_M_N ds_grid_desc_m_n;
 
                         // populate Ds desc
                         static_for<0, NumDTensor, 1>{}([&](auto i) {
-                            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                            ds_grid_desc_m_n(i) =
-                                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                                    a_g_n_k_wos_lengths,
-                                    a_g_n_k_wos_strides,
-                                    b_g_k_c_xs_lengths,
-                                    b_g_k_c_xs_strides,
-                                    ds_g_n_c_wis_lengths[i],
-                                    ds_g_n_c_wis_strides[i],
-                                    conv_filter_strides,
-                                    conv_filter_dilations,
-                                    input_left_pads,
-                                    input_right_pads,
-                                    tildes);
-                        });
-
-                        const auto e_grid_desc_m_n =
-                            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
+                            using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                            using ConvToGemmBwdDataTransformD =
+                                TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                              ConvBackwardDataSpecialization,
+                                                              AK1,
+                                                              BK1,
+                                                              MPerBlock,
+                                                              NPerBlock,
+                                                              KPerBlock,
+                                                              DoPadGemmM,
+                                                              DoPadGemmN,
+                                                              ALayout,
+                                                              BLayout,
+                                                              DLayout,
+                                                              true, /*SplitConvN*/
+                                                              ABDataType,
+                                                              DDataType>;
+                            ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{
                                 a_g_n_k_wos_lengths,
                                 a_g_n_k_wos_strides,
                                 b_g_k_c_xs_lengths,
                                 b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
+                                ds_g_n_c_wis_lengths[i],
+                                ds_g_n_c_wis_strides[i],
                                 conv_filter_strides,
                                 conv_filter_dilations,
                                 input_left_pads,
                                 input_right_pads,
-                                tildes);
+                                tildes};
+
+                            ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
+                        });
+
+                        const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();
 
                         // desc for problem definition
                         const auto a_grid_desc_m_k =
@@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                     }
                 }
             }
+            // A/B/Ds/E Batch Stride
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
+
+            compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_;
+            compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_;
         }
 
         void Print() const
@@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         // tensor descriptor for problem definition
         index_t num_group_;
+        index_t conv_N_per_block_;
         std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
         std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
         std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
@@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_n_;
 
         // element-wise op
         AElementwiseOp a_element_op_;
         BElementwiseOp b_element_op_;
         CDEElementwiseOp cde_element_op_;
 
-        // for checking IsSupportedArgument()
         std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
-        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
         std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
-        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
         std::array<index_t, NDimSpatial> conv_filter_strides_;
-        std::array<index_t, NDimSpatial> conv_filter_dilations_;
         std::array<index_t, NDimSpatial> input_left_pads_;
         std::array<index_t, NDimSpatial> input_right_pads_;
     };
@@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                 arg.Print();
             }
 
-            float ave_time = 0;
+            const index_t gdy = arg.num_group_;
+            const index_t num_workgroups_per_Conv_N =
+                arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_;
+            const index_t gdz = num_workgroups_per_Conv_N;
 
+            float ave_time = 0;
             for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
@@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                     throw std::runtime_error("wrong! device_op has invalid setting");
                 }
 
-                const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize(
-                                              arg.e_grid_desc_m_n_container_[i]) *
-                                          arg.num_group_;
+                const index_t gdx = arg.block_2_etile_map_container_[i].CalculateGridSize(
+                    arg.e_grid_desc_m_n_container_[i]);
 
                 const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1);
 
@@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                         DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                         Block2ETileMap,
                         ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
+                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
                         has_main_loop>;
 
                     return launch_and_time_kernel(
                         stream_config,
                         kernel,
-                        dim3(grid_size),
+                        dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
                         arg.p_a_grid_,
@@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                         arg.a_element_op_,
                         arg.b_element_op_,
                         arg.cde_element_op_,
-                        arg.a_g_n_k_wos_lengths_[0], // Group count
                         arg.a_grid_desc_ak0_m_ak1_container_[i],
                         arg.b_grid_desc_bk0_n_bk1_container_[i],
                         arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                         arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                         arg.block_2_etile_map_container_[i],
-                        arg.compute_ptr_offset_of_batch_);
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.compute_ptr_offset_of_n_);
                 };
 
                 if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 2be0b6681..8df0d885b 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -13,245 +13,614 @@
 namespace ck {
 namespace tensor_operation {
 
-namespace {
 template <
     index_t NDimSpatial,
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
+    index_t AK1,
+    index_t BK1,
+    index_t GemmMPerBlock,
+    index_t GemmNPerBlock,
+    index_t GemmKPerBlock,
+    bool DoPadGemmM,
+    bool DoPadGemmN,
     typename ALayout,
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization>
-constexpr auto make_out_grid_desc(const index_t N,
-                                  const index_t Do,
-                                  const index_t Ho,
-                                  const index_t Wo,
-                                  const index_t K,
-                                  const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides)
+    typename BLayout,
+    typename CLayout,
+    bool SplitN              = false,
+    typename ADataType       = float,
+    typename CDataType       = float,
+    index_t NumGroupsToMerge = 1,
+    typename IndexType       = index_t>
+struct TransformConvBwdDataToGemm_v1
 {
-    const auto KStride = Number<1>{};
+    private:
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
 
-    if constexpr(is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
-    {
-        const index_t NStride  = out_g_n_k_wos_strides[1];
-        const index_t HiStride = out_g_n_k_wos_strides[3];
-        const index_t WiStride = out_g_n_k_wos_strides[4];
-        if constexpr(ConvBwdDataSpecialization ==
-                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
-                         Filter1x1Stride1Pad0)
-        {
+    static constexpr auto NonSpatialDimsNum = Number<3>{};
 
-            return make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, K),
-                                                make_tuple(WiStride, KStride));
-        }
-        else
+    static constexpr auto DIdx = NonSpatialDimsNum;
+    static constexpr auto HIdx =
+        NDimSpatial == 2 ? NonSpatialDimsNum : Number<NonSpatialDimsNum + 1>{};
+    static constexpr auto WIdx =
+        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
+
+    static constexpr auto ZIdx = NonSpatialDimsNum;
+    static constexpr auto YIdx =
+        NDimSpatial == 2 ? NonSpatialDimsNum : Number<NonSpatialDimsNum + 1>{};
+    static constexpr auto XIdx =
+        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
+
+    template <typename ConvDimsType>
+    static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths,
+                                                          const ConvDimsType& strides,
+                                                          index_t i)
+    {
+        long_index_t acc = 1;
+        for(; i < (NDimSpatial + 3); i++)
         {
-            return make_naive_tensor_descriptor(make_tuple(N, Ho, Wo, K),
-                                                make_tuple(NStride, HiStride, WiStride, KStride));
+            acc +=
+                static_cast<long_index_t>(lengths[i] - I1) * static_cast<long_index_t>(strides[i]);
         }
+
+        return acc;
     }
-    else if constexpr(is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
+
+    template <typename ConvDimsType>
+    static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_k_wos_lengths,
+                                     const ConvDimsType& a_g_n_k_wos_strides,
+                                     const ConvDimsType& c_g_n_c_wis_lengths,
+                                     const ConvDimsType& c_g_n_c_wis_strides)
     {
-        const index_t NStride  = out_g_n_k_wos_strides[1];
-        const index_t DoStride = out_g_n_k_wos_strides[3];
-        const index_t HoStride = out_g_n_k_wos_strides[4];
-        const index_t WoStride = out_g_n_k_wos_strides[5];
-        if constexpr(ConvBwdDataSpecialization ==
-                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
-                         Filter1x1Stride1Pad0)
+        const long_index_t a_element_space_size =
+            calculate_element_space_size_impl(a_g_n_k_wos_lengths, a_g_n_k_wos_strides, I1);
+        const long_index_t c_element_space_size =
+            calculate_element_space_size_impl(c_g_n_c_wis_lengths, c_g_n_c_wis_strides, I1);
+        const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType),
+                                                          c_element_space_size * sizeof(CDataType));
+        constexpr long_index_t TwoGB          = (long_index_t{1} << 31);
+
+        const IndexType N = a_g_n_k_wos_lengths[I1];
+
+        if(element_space_size > TwoGB)
         {
+            // Minimum divisor of N to not exceed 2GB
+            const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB);
 
-            return make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, K),
-                                                make_tuple(WoStride, KStride));
+            if(divisor <= static_cast<double>(N))
+            {
+                // Find least divisor of N larger than element_space_size / TwoGB
+                // Iterate up to sqrt(N). There are no divisors above this value.
+                for(IndexType least_divisor = divisor; least_divisor * least_divisor <= N;
+                    least_divisor++)
+                {
+                    if(N % least_divisor == 0)
+                    {
+                        return N / least_divisor;
+                    }
+                }
+                // Not found, process one Convolution N per block
+                return 1;
+            }
+            else
+            {
+                // Not possible to support even after split N.
+                // Too large tensor.
+                return N;
+            }
         }
         else
         {
-            return make_naive_tensor_descriptor(
-                make_tuple(N, Do, Ho, Wo, K),
-                make_tuple(NStride, DoStride, HoStride, WoStride, KStride));
+            // Split N is not needed.
+            return N;
         }
     }
-    else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
+
+    public:
+    __host__ __device__ constexpr TransformConvBwdDataToGemm_v1() {}
+
+    template <typename TransformConvBwdDataToGemm_v1Base>
+    __host__ __device__ TransformConvBwdDataToGemm_v1(
+        const TransformConvBwdDataToGemm_v1Base& transform_conv_bwd_data_to_gemm_base)
+        : N_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.N_)},
+          Di_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Di_)},
+          Hi_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Hi_)},
+          Wi_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Wi_)},
+          Do_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Do_)},
+          Ho_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Ho_)},
+          Wo_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Wo_)},
+          Z_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Z_)},
+          Y_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.Y_)},
+          X_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.X_)},
+          K_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.K_)},
+          C_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.C_)},
+          DiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DiStride_)},
+          HiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HiStride_)},
+          WiStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WiStride_)},
+          DoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DoStride_)},
+          HoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HoStride_)},
+          WoStride_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WoStride_)},
+          CStrideTensorB_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.CStrideTensorB_)},
+          CStrideTensorC_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.CStrideTensorC_)},
+          KStrideTensorA_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.KStrideTensorA_)},
+          KStrideTensorB_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.KStrideTensorB_)},
+          NStrideTensorA_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.NStrideTensorA_)},
+          NStrideTensorC_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.NStrideTensorC_)},
+          ConvStrideD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideD_)},
+          ConvStrideH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideH_)},
+          ConvStrideW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvStrideW_)},
+          ConvDilationD_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationD_)},
+          ConvDilationH_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationH_)},
+          ConvDilationW_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ConvDilationW_)},
+          InLeftPadD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadD_)},
+          InLeftPadH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadH_)},
+          InLeftPadW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InLeftPadW_)},
+          InRightPadD_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadD_)},
+          InRightPadH_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadH_)},
+          InRightPadW_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.InRightPadW_)},
+          IdxZTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxZTilde_)},
+          IdxYTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxYTilde_)},
+          IdxXTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.IdxXTilde_)},
+          GcdStrideDilationD_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationD_)},
+          GcdStrideDilationH_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationH_)},
+          GcdStrideDilationW_{
+              static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.GcdStrideDilationW_)},
+          ZTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ZTilde_)},
+          YTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.YTilde_)},
+          XTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XTilde_)},
+          DTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.DTilde_)},
+          HTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.HTilde_)},
+          WTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WTilde_)},
+          ZDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ZDot_)},
+          YDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.YDot_)},
+          XDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XDot_)}
     {
-        // assume packed
-        if constexpr(ConvBwdDataSpecialization ==
-                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
-                         Filter1x1Stride1Pad0)
+    }
+
+    template <typename ConvDimsType, typename ConvSpatialDimsType>
+    __host__ __device__
+    TransformConvBwdDataToGemm_v1(const ConvDimsType& a_g_n_k_wos_lengths,
+                                  const ConvDimsType& a_g_n_k_wos_strides,
+                                  const ConvDimsType& b_g_k_c_xs_lengths,
+                                  const ConvDimsType& b_g_k_c_xs_strides,
+                                  const ConvDimsType& c_g_n_c_wis_lengths,
+                                  const ConvDimsType& c_g_n_c_wis_strides,
+                                  const ConvSpatialDimsType& conv_filter_strides,
+                                  const ConvSpatialDimsType& conv_filter_dilations,
+                                  const ConvSpatialDimsType& input_left_pads,
+                                  const ConvSpatialDimsType& input_right_pads,
+                                  const ConvSpatialDimsType& tildes)
+        : Hi_{c_g_n_c_wis_lengths[HIdx]},
+          Wi_{c_g_n_c_wis_lengths[WIdx]},
+          Ho_{a_g_n_k_wos_lengths[HIdx]},
+          Wo_{a_g_n_k_wos_lengths[WIdx]},
+          Y_{b_g_k_c_xs_lengths[YIdx]},
+          X_{b_g_k_c_xs_lengths[XIdx]},
+          K_{a_g_n_k_wos_lengths[I2]},
+          C_{b_g_k_c_xs_lengths[I2]},
+          HiStride_{c_g_n_c_wis_strides[HIdx]},
+          WiStride_{c_g_n_c_wis_strides[WIdx]},
+          HoStride_{a_g_n_k_wos_strides[HIdx]},
+          WoStride_{a_g_n_k_wos_strides[WIdx]},
+          CStrideTensorB_{b_g_k_c_xs_strides[I2]},
+          CStrideTensorC_{c_g_n_c_wis_strides[I2]},
+          KStrideTensorA_{a_g_n_k_wos_strides[I2]},
+          KStrideTensorB_{b_g_k_c_xs_strides[I1]},
+          NStrideTensorA_{a_g_n_k_wos_strides[I1]},
+          NStrideTensorC_{c_g_n_c_wis_strides[I1]},
+          ConvStrideH_{conv_filter_strides[HIdx - NonSpatialDimsNum]},
+          ConvStrideW_{conv_filter_strides[WIdx - NonSpatialDimsNum]},
+          ConvDilationH_{conv_filter_dilations[HIdx - NonSpatialDimsNum]},
+          ConvDilationW_{conv_filter_dilations[WIdx - NonSpatialDimsNum]},
+          InLeftPadH_{input_left_pads[HIdx - NonSpatialDimsNum]},
+          InLeftPadW_{input_left_pads[WIdx - NonSpatialDimsNum]},
+          InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]},
+          InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]},
+          IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]},
+          IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]}
+    {
+        static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
+                      is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
+        static_assert(is_same_v<ConvDimsType, std::array<IndexType, NDimSpatial + I3>> ||
+                      is_same_v<ConvDimsType, ck::Array<IndexType, NDimSpatial + I3>>);
+
+        if constexpr(SplitN)
         {
-            return make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K));
+            N_ = GetSplitedNSize(
+                a_g_n_k_wos_lengths, a_g_n_k_wos_strides, c_g_n_c_wis_lengths, c_g_n_c_wis_strides);
         }
         else
         {
-            return make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K));
+            N_ = c_g_n_c_wis_lengths[I1];
         }
-    }
-    else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNDHWK>)
-    {
-        // assume packed
-        if constexpr(ConvBwdDataSpecialization ==
-                     ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
-                         Filter1x1Stride1Pad0)
+        if constexpr(NDimSpatial == 3)
         {
-            return make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K));
+            Di_                 = c_g_n_c_wis_lengths[DIdx];
+            Do_                 = a_g_n_k_wos_lengths[DIdx];
+            Z_                  = b_g_k_c_xs_lengths[ZIdx];
+            DiStride_           = c_g_n_c_wis_strides[DIdx];
+            DoStride_           = a_g_n_k_wos_strides[DIdx];
+            ConvStrideD_        = conv_filter_strides[DIdx - NonSpatialDimsNum];
+            ConvDilationD_      = conv_filter_dilations[DIdx - NonSpatialDimsNum];
+            InLeftPadD_         = input_left_pads[DIdx - NonSpatialDimsNum];
+            InRightPadD_        = input_right_pads[DIdx - NonSpatialDimsNum];
+            IdxZTilde_          = tildes[ZIdx - NonSpatialDimsNum];
+            GcdStrideDilationD_ = math::gcd(ConvStrideD_, ConvDilationD_);
+            ZTilde_             = ConvStrideD_ / GcdStrideDilationD_;
+            DTilde_ = Do_ + math::integer_divide_ceil(ConvDilationD_ * (Z_ - I1), ConvStrideD_);
+            ZDot_   = math::integer_divide_ceil(Z_, ZTilde_);
         }
         else
         {
-            return make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K));
+            Di_ = Do_ = Z_ = ZTilde_ = ConvStrideD_ = DTilde_ = ZDot_ = 1;
+            InLeftPadD_ = InRightPadD_ = DiStride_ = DoStride_ = IdxZTilde_ = 0;
         }
-    }
-    else
-    {
-        throw std::runtime_error("wrong! unsupported layout: " + ALayout::name());
-    }
-}
 
-template <typename BLayout>
-constexpr auto make_wei_grid_desc(
-    const index_t K, const index_t Z, const index_t Y, const index_t X, const index_t C)
-{
+        GcdStrideDilationH_ = math::gcd(ConvStrideH_, ConvDilationH_);
+        GcdStrideDilationW_ = math::gcd(ConvStrideW_, ConvDilationW_);
 
-    if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
-    {
-        return make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C));
-    }
-    else if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
-    {
-        return make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C));
-    }
-    else
-    {
-        throw std::runtime_error("wrong! unsupported layout: " + BLayout::name());
-    }
-}
-
-template <index_t NDimSpatial, typename CLayout>
-constexpr auto make_in_grid_desc(const index_t N,
-                                 const index_t Di,
-                                 const index_t Hi,
-                                 const index_t Wi,
-                                 const index_t C,
-                                 const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides)
-{
+        YTilde_ = ConvStrideH_ / GcdStrideDilationH_;
+        XTilde_ = ConvStrideW_ / GcdStrideDilationW_;
 
-    if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
-                 is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
-                 is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>)
-    {
-        return make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C),
-                                            make_tuple(in_g_n_c_wis_strides[1],
-                                                       in_g_n_c_wis_strides[3],
-                                                       in_g_n_c_wis_strides[4],
-                                                       in_g_n_c_wis_strides[2]));
+        HTilde_ = Ho_ + math::integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_);
+        WTilde_ = Wo_ + math::integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_);
+
+        YDot_ = math::integer_divide_ceil(Y_, YTilde_);
+        XDot_ = math::integer_divide_ceil(X_, XTilde_);
     }
-    else if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
-                      is_same_v<CLayout, tensor_layout::convolution::NDHWGC>)
+
+#if 0 // At now not supported to split tensor
+    __host__ bool AreDescriptorsSmallerThan2GB() const
     {
-        return make_naive_tensor_descriptor(make_tuple(N, Di, Hi, Wi, C),
-                                            make_tuple(in_g_n_c_wis_strides[1],
-                                                       in_g_n_c_wis_strides[3],
-                                                       in_g_n_c_wis_strides[4],
-                                                       in_g_n_c_wis_strides[5],
-                                                       in_g_n_c_wis_strides[2]));
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+
+        const long_index_t in_desc_space_size =
+            I1 + (N_ - I1) * NStrideTensorC_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ +
+            (Wi_ - I1) * WiStride_ + (C_ - I1) * CStrideTensorC_;
+        const long_index_t out_desc_space_size =
+            I1 + (N_ - I1) * NStrideTensorA_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ +
+            (Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorA_;
+
+        bool is_a_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(ADataType)) <= TwoGB;
+        bool is_c_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(CDataType)) <= TwoGB;
+
+        return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB;
     }
-    else
+
+    __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
+                                   CDataType* c_grid_ptr_base) const
     {
-        throw std::runtime_error("wrong! unsupported layout: " + CLayout::name());
-    }
-}
+        // Create copies
+        auto conv_to_gemm_transformer_left  = *this;
+        auto conv_to_gemm_transformer_right = *this;
+        IndexType a_right_offset            = 0;
+        IndexType c_right_offset            = 0;
+        // Calculate real filter size
+        const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1;
+        const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1;
+        const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1;
+        // Calculate start position in input for right tensor
+        const IndexType di_right_transformer_start_idx = (Do_ / 2) * ConvStrideD_;
+        const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_;
+        const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_;
+        // Calculate last position in input for left tensor
+        const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff;
+        const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff;
+        const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff;
+        // Allow to split if whole left padding will be in left tensor and right padding in right
+        // tensor
+        const bool is_possible_to_split_d = Do_ != 1 &&
+                                            di_right_transformer_start_idx > InLeftPadD_ &&
+                                            di_left_transformer_end_idx <= (InLeftPadD_ + Di_);
+        const bool is_possible_to_split_h = Ho_ != 1 &&
+                                            hi_right_transformer_start_idx > InLeftPadH_ &&
+                                            hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_);
+        const bool is_possible_to_split_w = Wo_ != 1 &&
+                                            wi_right_transformer_start_idx > InLeftPadW_ &&
+                                            wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_);
+
+        if(is_possible_to_split_d)
+        {
+            // Apply new sizes
+            // Split output on half
+            conv_to_gemm_transformer_left.Do_  = Do_ / 2;
+            conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2;
+            // Assign left padding to left convolution
+            conv_to_gemm_transformer_left.InLeftPadD_  = InLeftPadD_;
+            conv_to_gemm_transformer_right.InLeftPadD_ = 0;
+            // Assign right padding to right convolution
+            conv_to_gemm_transformer_left.InRightPadD_  = 0;
+            conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
+            // Calculate new input size
+            conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_;
+            conv_to_gemm_transformer_right.Di_ =
+                math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_),
+                          (conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff);
+            ;
+            // Calcualte offsets
+            a_right_offset = (Do_ / 2) * DoStride_;
+            c_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_;
+        }
+        else if(is_possible_to_split_h)
+        {
+            conv_to_gemm_transformer_left.Ho_  = Ho_ / 2;
+            conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2;
 
-} // namespace
+            conv_to_gemm_transformer_left.InLeftPadH_  = InLeftPadH_;
+            conv_to_gemm_transformer_right.InLeftPadH_ = 0;
 
-template <
-    index_t NDimSpatial,
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization,
-    index_t AK1,
-    index_t BK1,
-    index_t GemmMPerBlock,
-    index_t GemmNPerBlock,
-    index_t GemmKPerBlock,
-    bool DoPadGemmM,
-    bool DoPadGemmN>
-struct TransformConvBwdDataToGemm_v1
-{
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
+            conv_to_gemm_transformer_left.InRightPadH_  = 0;
+            conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
 
-    static constexpr auto NonSpatialDimsNum = Number<3>{};
+            conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_;
+            conv_to_gemm_transformer_right.Hi_ =
+                math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_),
+                          (conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff);
+            a_right_offset = (Ho_ / 2) * HoStride_;
+            c_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_;
+        }
+        else if(is_possible_to_split_w)
+        {
+            conv_to_gemm_transformer_left.Wo_  = Wo_ / 2;
+            conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2;
 
-    static constexpr auto DIdx = Number<NonSpatialDimsNum>{};
-    static constexpr auto HIdx =
-        NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
-    static constexpr auto WIdx =
-        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
+            conv_to_gemm_transformer_left.InLeftPadW_  = InLeftPadW_;
+            conv_to_gemm_transformer_right.InLeftPadW_ = 0;
 
-    static constexpr auto ZIdx = Number<NonSpatialDimsNum>{};
-    static constexpr auto YIdx =
-        NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
-    static constexpr auto XIdx =
-        NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{} : Number<NonSpatialDimsNum + 2>{};
+            conv_to_gemm_transformer_left.InRightPadW_  = 0;
+            conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
 
-    template <typename ALayout,
-              typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
-                                          (is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
-                                           is_same_v<ALayout, tensor_layout::convolution::GNDHWK> ||
-                                           is_same_v<ALayout, tensor_layout::convolution::NHWGK> ||
-                                           is_same_v<ALayout, tensor_layout::convolution::NDHWGK>),
-                                      bool>::type = false>
-    static auto MakeADescriptor_AK0_M_AK1(
-        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
-        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
-        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
-        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-        const std::array<index_t, NDimSpatial>& input_left_pads,
-        const std::array<index_t, NDimSpatial>& /* input_right_pads */,
-        const std::array<index_t, NDimSpatial>& tildes)
+            conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_;
+            conv_to_gemm_transformer_right.Wi_ =
+                math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_),
+                          (conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff);
+
+            a_right_offset = (Wo_ / 2) * WoStride_;
+            c_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_;
+        }
+        // Return left transform, right transformer, right offset to Input and right offset to
+        // Output
+        return ck::make_tuple(conv_to_gemm_transformer_left,
+                              conv_to_gemm_transformer_right,
+                              a_grid_ptr_base + a_right_offset,
+                              c_grid_ptr_base + c_right_offset);
+    }
+
+    __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
+                                   CDataType* c_grid_ptr_base) const
     {
-        index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
-        index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
-        index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];
+        // Create copies
+        auto conv_to_gemm_transformer_left  = *this;
+        auto conv_to_gemm_transformer_right = *this;
+        IndexType a_right_offset            = 0;
+        IndexType c_right_offset            = 0;
+
+        // Calculate start position in input for right tensor
+        const IndexType do_right_transformer_start_idx = math::integer_divide_ceil((Di_ / 2) + InLeftPadD_ - ((Z_ - 1) * ConvDilationD_), ConvStrideD_);
+        const IndexType ho_right_transformer_start_idx = math::integer_divide_ceil((Hi_ / 2) + InLeftPadH_ - ((Y_ - 1) * ConvDilationH_), ConvStrideH_);
+        const IndexType wo_right_transformer_start_idx = math::integer_divide_ceil((Wi_ / 2) + InLeftPadW_ - ((X_ - 1) * ConvDilationW_), ConvStrideW_);
+        // Calculate last position in input for left tensor
+        const IndexType do_left_transformer_end_idx = math::integer_divide_ceil((Di_ / 2 - 1) + InLeftPadD_, ConvStrideD_);
+        const IndexType ho_left_transformer_end_idx = math::integer_divide_ceil((Hi_ / 2 - 1) + InLeftPadH_, ConvStrideH_);
+        const IndexType wo_left_transformer_end_idx = math::integer_divide_ceil((Wi_ / 2 - 1) + InLeftPadW_, ConvStrideW_);
+
+
+        if(Di_!=1)
+        {
+            // Apply new sizes
+            // Split output on half
+            conv_to_gemm_transformer_left.Di_  = Di_ / 2;
+            conv_to_gemm_transformer_right.Di_ = Di_ - Di_ / 2;
+            // Assign left padding to left convolution
+            conv_to_gemm_transformer_left.InLeftPadD_  = InLeftPadD_;
+            conv_to_gemm_transformer_right.InLeftPadD_ = 0;
+            // // Assign right padding to right convolution
+            conv_to_gemm_transformer_left.InRightPadD_  = 0;
+            conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
+            // Calculate new input size
+            conv_to_gemm_transformer_left.Do_ = do_left_transformer_end_idx;
+            conv_to_gemm_transformer_right.Do_ = Do_ - do_right_transformer_start_idx;
+            ;
+            // Calcualte offsets
+            a_right_offset = do_right_transformer_start_idx * DoStride_;
+            c_right_offset = (Di_ / 2) * DiStride_;
+        }
+        else if(Hi_!=1)
+        {
+            // Apply new sizes
+            // Split output on half
+            conv_to_gemm_transformer_left.Hi_  = Hi_ / 2;
+            conv_to_gemm_transformer_right.Hi_ = Hi_ - Hi_ / 2;
+            // Assign left padding to left convolution
+            conv_to_gemm_transformer_left.InLeftPadH_  = InLeftPadH_;
+            conv_to_gemm_transformer_right.InLeftPadH_ = 0;
+            // // Assign right padding to right convolution
+            conv_to_gemm_transformer_left.InRightPadH_  = 0;
+            conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
+            // Calculate new input size
+            conv_to_gemm_transformer_left.Ho_ = ho_left_transformer_end_idx ;
+            conv_to_gemm_transformer_right.Ho_ = Ho_ - ho_right_transformer_start_idx ;
+            ;
+            // Calcualte offsets
+            a_right_offset = ho_right_transformer_start_idx * HoStride_;
+            c_right_offset = (Hi_ / 2) * HiStride_;
+        }
+        else if(Wi_!=1)
+        {
+            // Apply new sizes
+            // Split output on half
+            conv_to_gemm_transformer_left.Wi_  = Wi_ / 2;
+            conv_to_gemm_transformer_right.Wi_ = Wi_ - Wi_ / 2;
+            // Assign left padding to left convolution
+            conv_to_gemm_transformer_left.InLeftPadW_  = InLeftPadW_;
+            conv_to_gemm_transformer_right.InLeftPadW_ = 0;
+            // Assign right padding to right convolution
+            conv_to_gemm_transformer_left.InRightPadW_  = 0;
+            conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
+            // Calculate new input size
+            conv_to_gemm_transformer_left.Wo_ = wo_left_transformer_end_idx;
+            conv_to_gemm_transformer_right.Wo_ = Wo_ - wo_right_transformer_start_idx;
+            ;
+            // Calcualte offsets
+            a_right_offset = wo_right_transformer_start_idx * WoStride_;
+            c_right_offset = (Wi_ / 2) * WiStride_;
+        }
+        // Return left transform, right transformer, right offset to Input and right offset to
+        // Output
+        return ck::make_tuple(conv_to_gemm_transformer_left,
+                              conv_to_gemm_transformer_right,
+                              a_grid_ptr_base + a_right_offset,
+                              c_grid_ptr_base + c_right_offset);
+    }
+#endif
 
-        const index_t N = in_g_n_c_wis_lengths[1];
-        const index_t K = wei_g_k_c_xs_lengths[1];
+    __host__ __device__ auto MakeOutGridDesc() const
+    {
+        if constexpr(is_same_v<ALayout, tensor_layout::convolution::NHWGK>)
+        {
+            if constexpr(ConvBwdDataSpecialization ==
+                         ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                             Filter1x1Stride1Pad0)
+            {
 
-        const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1;
-        const index_t Hi = in_g_n_c_wis_lengths[HIdx];
-        const index_t Wi = in_g_n_c_wis_lengths[WIdx];
+                return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_),
+                                                    make_tuple(WoStride_, KStrideTensorA_));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(N_, Ho_, Wo_, K_),
+                    make_tuple(NStrideTensorA_, HoStride_, WoStride_, KStrideTensorA_));
+            }
+        }
+        else if constexpr(is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
+        {
+            if constexpr(ConvBwdDataSpecialization ==
+                         ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                             Filter1x1Stride1Pad0)
+            {
 
-        const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
-        const index_t Ho = out_g_n_k_wos_lengths[HIdx];
-        const index_t Wo = out_g_n_k_wos_lengths[WIdx];
+                return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_),
+                                                    make_tuple(WoStride_, KStrideTensorA_));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(N_, Do_, Ho_, Wo_, K_),
+                    make_tuple(NStrideTensorA_, DoStride_, HoStride_, WoStride_, KStrideTensorA_));
+            }
+        }
+        else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
+        {
+            // assume packed
+            if constexpr(ConvBwdDataSpecialization ==
+                         ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                             Filter1x1Stride1Pad0)
+            {
+                return make_naive_tensor_descriptor_packed(make_tuple(N_ * Ho_ * Wo_, K_));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor_packed(make_tuple(N_, Ho_, Wo_, K_));
+            }
+        }
+        else if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNDHWK>)
+        {
+            // assume packed
+            if constexpr(ConvBwdDataSpecialization ==
+                         ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
+                             Filter1x1Stride1Pad0)
+            {
+                return make_naive_tensor_descriptor_packed(make_tuple(N_ * Do_ * Ho_ * Wo_, K_));
+            }
+            else
+            {
+                return make_naive_tensor_descriptor_packed(make_tuple(N_, Do_, Ho_, Wo_, K_));
+            }
+        }
+        else
+        {
+            throw std::runtime_error("wrong! unsupported layout: " + ALayout::name());
+        }
+    }
 
-        const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
-        const index_t Y = wei_g_k_c_xs_lengths[YIdx];
-        const index_t X = wei_g_k_c_xs_lengths[XIdx];
+    __host__ __device__ auto MakeWeiGridDesc() const
+    {
 
-        const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum];
-        const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum];
-        const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum];
+        if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
+        {
+            return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_));
+        }
+        else if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
+        {
+            return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_));
+        }
+        else
+        {
+            throw std::runtime_error("wrong! unsupported layout: " + BLayout::name());
+        }
+    }
 
-        const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
-        const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
-        const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
+    __host__ __device__ auto MakeInGridDesc() const
+    {
 
-        const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
-        const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
-        const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
+        if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
+                     is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
+                     is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N_, Hi_, Wi_, C_),
+                make_tuple(NStrideTensorC_, HiStride_, WiStride_, CStrideTensorC_));
+        }
+        else if constexpr(is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
+                          is_same_v<CLayout, tensor_layout::convolution::NDHWGC>)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N_, Di_, Hi_, Wi_, C_),
+                make_tuple(NStrideTensorC_, DiStride_, HiStride_, WiStride_, CStrideTensorC_));
+        }
+        else
+        {
+            throw std::runtime_error("wrong! unsupported layout: " + CLayout::name());
+        }
+    }
 
+    template <
+        typename ALayout_                   = ALayout,
+        typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
+                                    (is_same_v<ALayout_, tensor_layout::convolution::GNHWK> ||
+                                     is_same_v<ALayout_, tensor_layout::convolution::GNDHWK> ||
+                                     is_same_v<ALayout_, tensor_layout::convolution::NHWGK> ||
+                                     is_same_v<ALayout_, tensor_layout::convolution::NDHWGK>),
+                                bool>::type = false>
+    __host__ __device__ auto MakeADescriptor_AK0_M_AK1() const
+    {
         // n_do_ho_wo_k for 3d or n_ho_wo_k for 2d
-        const auto out_grid_desc =
-            make_out_grid_desc<NDimSpatial, ALayout, ConvBwdDataSpecialization>(
-                N, Do, Ho, Wo, K, out_g_n_k_wos_strides);
+        const auto out_grid_desc = MakeOutGridDesc();
 
         if constexpr(ConvBwdDataSpecialization ==
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
                          Filter1x1Stride1Pad0)
         {
-            const index_t AK0 = math::integer_divide_ceil(K, AK1);
+            const index_t AK0 = math::integer_divide_ceil(K_, AK1);
 
             // A: output tensor
             const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_grid_desc,
-                make_tuple(make_pass_through_transform(N * Do * Ho * Wo),
+                make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_),
                            make_unmerge_transform(make_tuple(AK0, AK1))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
@@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1
         }
         else
         {
-            const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
-            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-            const auto ZTilde = ConvStrideD / GcdStrideDilationD;
-            const auto YTilde = ConvStrideH / GcdStrideDilationH;
-            const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-            const auto ZDot = math::integer_divide_ceil(Z, ZTilde);
-            const auto YDot = math::integer_divide_ceil(Y, YTilde);
-            const auto XDot = math::integer_divide_ceil(X, XTilde);
-
-            const auto DTilde =
-                Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD);
-            const auto HTilde =
-                Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
-            const auto WTilde =
-                Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
-
             // only work on HTilde and WTilde that contribute to non-padding area of input tensor
             const auto IDTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD);
+                math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_);
             const auto IHTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
+                math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
             const auto IWTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
+                math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
 
             const auto IDTildeSliceEnd = math::min(
-                DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1);
+                DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1);
             const auto IHTildeSliceEnd = math::min(
-                HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+                HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
             const auto IWTildeSliceEnd = math::min(
-                WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+                WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
 
             const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin;
             const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
             const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
 
             // GemmK is different for each GEMM
-            const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde);
-            const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
-            const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
+            const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_);
+            const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
+            const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
 
             if constexpr(NDimSpatial == 2)
             {
                 // A: output tensor
                 const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
                     out_grid_desc,
-                    make_tuple(make_pass_through_transform(N),
-                               make_pad_transform(Ho, I0, I0),
-                               make_pad_transform(Wo, I0, I0),
-                               make_pass_through_transform(K)),
+                    make_tuple(make_pass_through_transform(N_),
+                               make_pad_transform(Ho_, I0, I0),
+                               make_pad_transform(Wo_, I0, I0),
+                               make_pass_through_transform(K_)),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
 
                 const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
                     out_n_hop_wop_k_grid_desc,
                     make_tuple(
-                        make_pass_through_transform(N),
-                        make_embed_transform(make_tuple(YDot, HTilde),
-                                             make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
-                        make_embed_transform(make_tuple(XDot, WTilde),
-                                             make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
-                        make_pass_through_transform(K)),
+                        make_pass_through_transform(N_),
+                        make_embed_transform(make_tuple(YDot_, HTilde_),
+                                             make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
+                        make_embed_transform(make_tuple(XDot_, WTilde_),
+                                             make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
+                        make_pass_through_transform(K_)),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
 
                 const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
                     transform_tensor_descriptor(
                         out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
-                        make_tuple(make_pass_through_transform(N),
-                                   make_slice_transform(YDot, I0, YDotSlice),
-                                   make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                                   make_slice_transform(XDot, I0, XDotSlice),
-                                   make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                                   make_pass_through_transform(K)),
+                        make_tuple(make_pass_through_transform(N_),
+                                   make_slice_transform(YDot_, I0, YDotSlice),
+                                   make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                                   make_slice_transform(XDot_, I0, XDotSlice),
+                                   make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                                   make_pass_through_transform(K_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<2>{},
@@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1
 
                 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
                     out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
-                    make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)),
-                               make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))),
+                    make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
+                               make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice))),
                     make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1
                 // A: output tensor
                 const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
                     out_grid_desc,
-                    make_tuple(make_pass_through_transform(N),
-                               make_pad_transform(Do, I0, I0),
-                               make_pad_transform(Ho, I0, I0),
-                               make_pad_transform(Wo, I0, I0),
-                               make_pass_through_transform(K)),
+                    make_tuple(make_pass_through_transform(N_),
+                               make_pad_transform(Do_, I0, I0),
+                               make_pad_transform(Ho_, I0, I0),
+                               make_pad_transform(Wo_, I0, I0),
+                               make_pass_through_transform(K_)),
                     make_tuple(
                         Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                     make_tuple(
@@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1
                 const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc =
                     transform_tensor_descriptor(
                         out_n_hop_wop_k_grid_desc,
-                        make_tuple(make_pass_through_transform(N),
+                        make_tuple(make_pass_through_transform(N_),
                                    make_embed_transform(
-                                       make_tuple(ZDot, DTilde),
-                                       make_tuple(-ConvDilationD / GcdStrideDilationD, I1)),
+                                       make_tuple(ZDot_, DTilde_),
+                                       make_tuple(-ConvDilationD_ / GcdStrideDilationD_, I1)),
                                    make_embed_transform(
-                                       make_tuple(YDot, HTilde),
-                                       make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
+                                       make_tuple(YDot_, HTilde_),
+                                       make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
                                    make_embed_transform(
-                                       make_tuple(XDot, WTilde),
-                                       make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
-                                   make_pass_through_transform(K)),
+                                       make_tuple(XDot_, WTilde_),
+                                       make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
+                                   make_pass_through_transform(K_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<2>{},
@@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1
                     out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
                         transform_tensor_descriptor(
                             out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc,
-                            make_tuple(make_pass_through_transform(N),
-                                       make_slice_transform(ZDot, I0, ZDotSlice),
-                                       make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice),
-                                       make_slice_transform(YDot, I0, YDotSlice),
-                                       make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                                       make_slice_transform(XDot, I0, XDotSlice),
-                                       make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                                       make_pass_through_transform(K)),
+                            make_tuple(
+                                make_pass_through_transform(N_),
+                                make_slice_transform(ZDot_, I0, ZDotSlice),
+                                make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
+                                make_slice_transform(YDot_, I0, YDotSlice),
+                                make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                                make_slice_transform(XDot_, I0, XDotSlice),
+                                make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                                make_pass_through_transform(K_)),
                             make_tuple(Sequence<0>{},
                                        Sequence<1>{},
                                        Sequence<2>{},
@@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1
                 const auto out_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
                     out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
                     make_tuple(
-                        make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)),
-                        make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice))),
+                        make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
+                        make_merge_transform(
+                            make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice))),
                     make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1
         }
     }
 
-    template <typename BLayout,
+    template <typename BLayout_                   = BLayout,
               typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
-                                          (is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
-                                           is_same_v<BLayout, tensor_layout::convolution::GKZYXC>),
+                                          (is_same_v<BLayout_, tensor_layout::convolution::GKYXC> ||
+                                           is_same_v<BLayout_, tensor_layout::convolution::GKZYXC>),
                                       bool>::type = false>
-    static auto MakeBDescriptor_BK0_N_BK1(
-        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
-        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
-        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
-        const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
-        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-        const std::array<index_t, NDimSpatial>& /* input_left_pads */,
-        const std::array<index_t, NDimSpatial>& /* input_right_pads */,
-        const std::array<index_t, NDimSpatial>& tildes)
+    __host__ __device__ auto MakeBDescriptor_BK0_N_BK1() const
     {
-        index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
-        index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
-        index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];
-
-        const index_t N = in_g_n_c_wis_lengths[1];
-        const index_t K = wei_g_k_c_xs_lengths[1];
-        const index_t C = wei_g_k_c_xs_lengths[2];
-
-        const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
-        const index_t Ho = out_g_n_k_wos_lengths[HIdx];
-        const index_t Wo = out_g_n_k_wos_lengths[WIdx];
-
-        const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
-        const index_t Y = wei_g_k_c_xs_lengths[YIdx];
-        const index_t X = wei_g_k_c_xs_lengths[XIdx];
-
-        const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
-        const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
-        const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
-
-        const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
-        const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
-        const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
-
         // assume packed
         // k_y_x_c for 2d or k_z_y_x_c for 3d
-        const auto wei_grid_desc = make_wei_grid_desc<BLayout>(K, Z, Y, X, C);
+        const auto wei_grid_desc = MakeWeiGridDesc();
 
         if constexpr(ConvBwdDataSpecialization ==
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
                          Filter1x1Stride1Pad0)
         {
-            const index_t BK0 = math::integer_divide_ceil(K, BK1);
+            const index_t BK0 = math::integer_divide_ceil(K_, BK1);
 
             // B: weight tensor
             const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc =
-                transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)),
+                transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)),
                                             make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(C)),
+                                                       make_pass_through_transform(C_)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-            make_naive_tensor_descriptor(make_tuple(N * Do * Ho * Wo, C), make_tuple(I0, I1));
+            make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1));
 
             const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
@@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1
         }
         else
         {
-            const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
-            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-            const auto ZTilde = ConvStrideD / GcdStrideDilationD;
-            const auto YTilde = ConvStrideH / GcdStrideDilationH;
-            const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-            const auto ZDot = math::integer_divide_ceil(Z, ZTilde);
-            const auto YDot = math::integer_divide_ceil(Y, YTilde);
-            const auto XDot = math::integer_divide_ceil(X, XTilde);
-
             // GemmK is different for each GEMM
-            const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde);
-            const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
-            const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
+            const auto ZDotSlice = math::integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_);
+            const auto YDotSlice = math::integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
+            const auto XDotSlice = math::integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
 
             // B weight tensor
             if constexpr(NDimSpatial == 2)
@@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1
                 const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
                     wei_grid_desc,
                     make_tuple(
-                        make_pass_through_transform(K),
-                        make_embed_transform(make_tuple(YDot, YTilde),
-                                             make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
-                        make_embed_transform(make_tuple(XDot, XTilde),
-                                             make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
-                        make_pass_through_transform(C)),
+                        make_pass_through_transform(K_),
+                        make_embed_transform(make_tuple(YDot_, YTilde_),
+                                             make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
+                        make_embed_transform(make_tuple(XDot_, XTilde_),
+                                             make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
+                        make_pass_through_transform(C_)),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
 
                 const auto wei_k_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor(
                     wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
-                    make_tuple(make_pass_through_transform(K),
-                               make_slice_transform(YDot, I0, YDotSlice),
-                               make_slice_transform(XDot, I0, XDotSlice),
-                               make_freeze_transform(i_ytilde),
-                               make_freeze_transform(i_xtilde),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_pass_through_transform(K_),
+                               make_slice_transform(YDot_, I0, YDotSlice),
+                               make_slice_transform(XDot_, I0, XDotSlice),
+                               make_freeze_transform(IdxYTilde_),
+                               make_freeze_transform(IdxXTilde_),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<0>{},
                                Sequence<1>{},
                                Sequence<3>{},
@@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1
 
                 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor(
                     wei_k_ydotslice_xdotslice_c_grid_desc,
-                    make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K)),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<1, 2, 0>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1
                 const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc =
                     transform_tensor_descriptor(
                         wei_grid_desc,
-                        make_tuple(
-                            make_pass_through_transform(K),
-                            make_embed_transform(make_tuple(ZDot, ZTilde),
-                                                 make_tuple(ConvStrideD / GcdStrideDilationD, I1)),
-                            make_embed_transform(make_tuple(YDot, YTilde),
-                                                 make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
-                            make_embed_transform(make_tuple(XDot, XTilde),
-                                                 make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
-                            make_pass_through_transform(C)),
+                        make_tuple(make_pass_through_transform(K_),
+                                   make_embed_transform(
+                                       make_tuple(ZDot_, ZTilde_),
+                                       make_tuple(ConvStrideD_ / GcdStrideDilationD_, I1)),
+                                   make_embed_transform(
+                                       make_tuple(YDot_, YTilde_),
+                                       make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
+                                   make_embed_transform(
+                                       make_tuple(XDot_, XTilde_),
+                                       make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
+                                   make_pass_through_transform(C_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<2>{},
@@ -659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1
                 const auto wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc =
                     transform_tensor_descriptor(
                         wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc,
-                        make_tuple(make_pass_through_transform(K),
-                                   make_slice_transform(ZDot, I0, ZDotSlice),
-                                   make_slice_transform(YDot, I0, YDotSlice),
-                                   make_slice_transform(XDot, I0, XDotSlice),
-                                   make_freeze_transform(i_ztilde),
-                                   make_freeze_transform(i_ytilde),
-                                   make_freeze_transform(i_xtilde),
-                                   make_pass_through_transform(C)),
+                        make_tuple(make_pass_through_transform(K_),
+                                   make_slice_transform(ZDot_, I0, ZDotSlice),
+                                   make_slice_transform(YDot_, I0, YDotSlice),
+                                   make_slice_transform(XDot_, I0, XDotSlice),
+                                   make_freeze_transform(IdxZTilde_),
+                                   make_freeze_transform(IdxYTilde_),
+                                   make_freeze_transform(IdxXTilde_),
+                                   make_pass_through_transform(C_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<3>{},
@@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1
 
                 const auto wei_gemmk_gemmnraw_grid_desc = transform_tensor_descriptor(
                     wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc,
-                    make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K)),
-                               make_pass_through_transform(C)),
+                    make_tuple(
+                        make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
+                        make_pass_through_transform(C_)),
                     make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1
         }
     }
 
-    template <typename CLayout,
-              typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
-                                          (is_same_v<CLayout, tensor_layout::convolution::GNHWC> ||
-                                           is_same_v<CLayout, tensor_layout::convolution::GNDHWC> ||
-                                           is_same_v<CLayout, tensor_layout::convolution::NHWGC> ||
-                                           is_same_v<CLayout, tensor_layout::convolution::NDHWGC> ||
-                                           is_same_v<CLayout, tensor_layout::convolution::G_NHW_C>),
-                                      bool>::type = false>
-    static auto
-    MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-                        const std::array<index_t, NDimSpatial>& input_left_pads,
-                        const std::array<index_t, NDimSpatial>& input_right_pads,
-                        const std::array<index_t, NDimSpatial>& tildes)
+    template <
+        typename CLayout_                   = CLayout,
+        typename std::enable_if<(NDimSpatial == 2 || NDimSpatial == 3) &&
+                                    (is_same_v<CLayout_, tensor_layout::convolution::GNHWC> ||
+                                     is_same_v<CLayout_, tensor_layout::convolution::GNDHWC> ||
+                                     is_same_v<CLayout_, tensor_layout::convolution::NHWGC> ||
+                                     is_same_v<CLayout_, tensor_layout::convolution::NDHWGC> ||
+                                     is_same_v<CLayout_, tensor_layout::convolution::G_NHW_C>),
+                                bool>::type = false>
+    __host__ __device__ auto MakeCDescriptor_M_N() const
     {
-        index_t i_ztilde = tildes[ZIdx - NonSpatialDimsNum];
-        index_t i_ytilde = tildes[YIdx - NonSpatialDimsNum];
-        index_t i_xtilde = tildes[XIdx - NonSpatialDimsNum];
-
-        const index_t N = in_g_n_c_wis_lengths[1];
-        const index_t C = wei_g_k_c_xs_lengths[2];
-
-        const index_t Di = NDimSpatial == 3 ? in_g_n_c_wis_lengths[DIdx] : 1;
-        const index_t Hi = in_g_n_c_wis_lengths[HIdx];
-        const index_t Wi = in_g_n_c_wis_lengths[WIdx];
-
-        const index_t Do = NDimSpatial == 3 ? out_g_n_k_wos_lengths[DIdx] : 1;
-        const index_t Ho = out_g_n_k_wos_lengths[HIdx];
-        const index_t Wo = out_g_n_k_wos_lengths[WIdx];
-
-        const index_t Z = NDimSpatial == 3 ? wei_g_k_c_xs_lengths[ZIdx] : 1;
-        const index_t Y = wei_g_k_c_xs_lengths[YIdx];
-        const index_t X = wei_g_k_c_xs_lengths[XIdx];
-
-        const index_t InLeftPadD = input_left_pads[DIdx - NonSpatialDimsNum];
-        const index_t InLeftPadH = input_left_pads[HIdx - NonSpatialDimsNum];
-        const index_t InLeftPadW = input_left_pads[WIdx - NonSpatialDimsNum];
-
-        const index_t InRightPadD = input_right_pads[DIdx - NonSpatialDimsNum];
-        const index_t InRightPadH = input_right_pads[HIdx - NonSpatialDimsNum];
-        const index_t InRightPadW = input_right_pads[WIdx - NonSpatialDimsNum];
-
-        const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
-        const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
-        const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
-
-        const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
-        const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
-        const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
-
         // assume strided
         // n_hi_wi_c for 2d n_di_hi_wi_c for 3d
-        const auto in_grid_desc =
-            make_in_grid_desc<NDimSpatial, CLayout>(N, Di, Hi, Wi, C, in_g_n_c_wis_strides);
+        const auto in_grid_desc = MakeInGridDesc();
 
         if constexpr(ConvBwdDataSpecialization ==
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
@@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1
                 const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
                     in_grid_desc,
                     make_tuple(
-                        make_pass_through_transform(N),
-                        make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)),
-                        make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)),
-                        make_pass_through_transform(C)),
+                        make_pass_through_transform(N_),
+                        make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)),
+                        make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)),
+                        make_pass_through_transform(C_)),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
 
@@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1
                     in_n_y_ho_x_wo_c_grid_desc,
                     make_tuple(make_freeze_transform(I0),
                                make_freeze_transform(I0),
-                               make_merge_transform(make_tuple(N, Ho, Wo)),
-                               make_pass_through_transform(C)),
+                               make_merge_transform(make_tuple(N_, Ho_, Wo_)),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}),
                     make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{}));
 
@@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1
                 const auto in_n_x_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
                     in_grid_desc,
                     make_tuple(
-                        make_pass_through_transform(N),
-                        make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)),
-                        make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)),
-                        make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)),
-                        make_pass_through_transform(C)),
+                        make_pass_through_transform(N_),
+                        make_embed_transform(make_tuple(I1, Do_), make_tuple(I1, ConvStrideD_)),
+                        make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)),
+                        make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)),
+                        make_pass_through_transform(C_)),
                     make_tuple(
                         Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                     make_tuple(Sequence<0>{},
@@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1
                     make_tuple(make_freeze_transform(I0),
                                make_freeze_transform(I0),
                                make_freeze_transform(I0),
-                               make_merge_transform(make_tuple(N, Do, Ho, Wo)),
-                               make_pass_through_transform(C)),
+                               make_merge_transform(make_tuple(N_, Do_, Ho_, Wo_)),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<1>{},
                                Sequence<3>{},
                                Sequence<5>{},
@@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1
         }
         else
         {
-            const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
-            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-            const auto ZTilde = ConvStrideD / GcdStrideDilationD;
-            const auto YTilde = ConvStrideH / GcdStrideDilationH;
-            const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-            const auto DTilde =
-                Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD);
-            const auto HTilde =
-                Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
-            const auto WTilde =
-                Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
-
             // only work on DTilde, HTilde and WTilde that contribute to
             // non-padding area of input tensor
             const auto IDTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD);
+                math::max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_);
             const auto IHTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
+                math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
             const auto IWTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
+                math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
 
             const auto IDTildeSliceEnd = math::min(
-                DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1);
+                DTilde_, math::integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1);
             const auto IHTildeSliceEnd = math::min(
-                HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+                HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
             const auto IWTildeSliceEnd = math::min(
-                WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+                WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
 
             const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin;
             const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
@@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1
             {
                 const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
                     in_grid_desc,
-                    make_tuple(make_pass_through_transform(N),
-                               make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                               make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_pass_through_transform(N_),
+                               make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
+                               make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
 
                 const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc =
                     transform_tensor_descriptor(
                         in_n_hip_wip_c_grid_desc,
-                        make_tuple(make_pass_through_transform(N),
-                                   make_embed_transform(make_tuple(YTilde, HTilde),
-                                                        make_tuple(ConvDilationH, ConvStrideH)),
-                                   make_embed_transform(make_tuple(XTilde, WTilde),
-                                                        make_tuple(ConvDilationW, ConvStrideW)),
-                                   make_pass_through_transform(C)),
+                        make_tuple(make_pass_through_transform(N_),
+                                   make_embed_transform(make_tuple(YTilde_, HTilde_),
+                                                        make_tuple(ConvDilationH_, ConvStrideH_)),
+                                   make_embed_transform(make_tuple(XTilde_, WTilde_),
+                                                        make_tuple(ConvDilationW_, ConvStrideW_)),
+                                   make_pass_through_transform(C_)),
                         make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                         make_tuple(
                             Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
 
                 const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
                     in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
-                    make_tuple(make_pass_through_transform(N),
-                               make_freeze_transform(i_ytilde),
-                               make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                               make_freeze_transform(i_xtilde),
-                               make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_pass_through_transform(N_),
+                               make_freeze_transform(IdxYTilde_),
+                               make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                               make_freeze_transform(IdxXTilde_),
+                               make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<0>{},
                                Sequence<1>{},
                                Sequence<2>{},
@@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1
 
                 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
                     in_n_htildeslice_wtildeslice_c_grid_desc,
-                    make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice)),
+                               make_pass_through_transform(C_)),
                     make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1
             {
                 const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor(
                     in_grid_desc,
-                    make_tuple(make_pass_through_transform(N),
-                               make_pad_transform(Di, InLeftPadD, InRightPadD),
-                               make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                               make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                               make_pass_through_transform(C)),
+                    make_tuple(make_pass_through_transform(N_),
+                               make_pad_transform(Di_, InLeftPadD_, InRightPadD_),
+                               make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
+                               make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
+                               make_pass_through_transform(C_)),
                     make_tuple(
                         Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                     make_tuple(
@@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1
                 const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc =
                     transform_tensor_descriptor(
                         in_n_dip_hip_wip_c_grid_desc,
-                        make_tuple(make_pass_through_transform(N),
-                                   make_embed_transform(make_tuple(ZTilde, DTilde),
-                                                        make_tuple(ConvDilationD, ConvStrideD)),
-                                   make_embed_transform(make_tuple(YTilde, HTilde),
-                                                        make_tuple(ConvDilationH, ConvStrideH)),
-                                   make_embed_transform(make_tuple(XTilde, WTilde),
-                                                        make_tuple(ConvDilationW, ConvStrideW)),
-                                   make_pass_through_transform(C)),
+                        make_tuple(make_pass_through_transform(N_),
+                                   make_embed_transform(make_tuple(ZTilde_, DTilde_),
+                                                        make_tuple(ConvDilationD_, ConvStrideD_)),
+                                   make_embed_transform(make_tuple(YTilde_, HTilde_),
+                                                        make_tuple(ConvDilationH_, ConvStrideH_)),
+                                   make_embed_transform(make_tuple(XTilde_, WTilde_),
+                                                        make_tuple(ConvDilationW_, ConvStrideW_)),
+                                   make_pass_through_transform(C_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<2>{},
@@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1
                 const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc =
                     transform_tensor_descriptor(
                         in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc,
-                        make_tuple(make_pass_through_transform(N),
-                                   make_freeze_transform(i_ztilde),
-                                   make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice),
-                                   make_freeze_transform(i_ytilde),
-                                   make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
-                                   make_freeze_transform(i_xtilde),
-                                   make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
-                                   make_pass_through_transform(C)),
+                        make_tuple(make_pass_through_transform(N_),
+                                   make_freeze_transform(IdxZTilde_),
+                                   make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
+                                   make_freeze_transform(IdxYTilde_),
+                                   make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                                   make_freeze_transform(IdxXTilde_),
+                                   make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                                   make_pass_through_transform(C_)),
                         make_tuple(Sequence<0>{},
                                    Sequence<1>{},
                                    Sequence<2>{},
@@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1
                 const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
                     in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc,
                     make_tuple(
-                        make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)),
-                        make_pass_through_transform(C)),
+                        make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice)),
+                        make_pass_through_transform(C_)),
                     make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
                     make_tuple(Sequence<0>{}, Sequence<1>{}));
 
@@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1
     }
 
     // for input bias
-    template <typename CLayout,
+    template <typename CLayout_                   = CLayout,
               typename std::enable_if<NDimSpatial == 2 &&
-                                          (is_same_v<CLayout, tensor_layout::convolution::GC> ||
-                                           is_same_v<CLayout, tensor_layout::convolution::G_C>),
+                                          (is_same_v<CLayout_, tensor_layout::convolution::GC> ||
+                                           is_same_v<CLayout_, tensor_layout::convolution::G_C>),
                                       bool>::type = false>
-    static auto
-    MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* out_g_n_k_wos_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* wei_g_k_c_xs_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* in_g_n_c_wis_strides */,
-                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-                        const std::array<index_t, NDimSpatial>& input_left_pads,
-                        const std::array<index_t, NDimSpatial>& /* input_right_pads */,
-                        const std::array<index_t, NDimSpatial>& /* tildes */)
+    __host__ __device__ auto MakeCDescriptor_M_N() const
     {
-        const index_t N = in_g_n_c_wis_lengths[1];
-        const index_t C = wei_g_k_c_xs_lengths[2];
-
-        const index_t Hi = in_g_n_c_wis_lengths[3];
-        const index_t Wi = in_g_n_c_wis_lengths[4];
-
-        const index_t Ho = out_g_n_k_wos_lengths[3];
-        const index_t Wo = out_g_n_k_wos_lengths[4];
-
-        const index_t Y = wei_g_k_c_xs_lengths[3];
-        const index_t X = wei_g_k_c_xs_lengths[4];
-
-        const index_t InLeftPadH = input_left_pads[0];
-        const index_t InLeftPadW = input_left_pads[1];
-
-        const index_t ConvStrideH = conv_filter_strides[0];
-        const index_t ConvStrideW = conv_filter_strides[1];
-
-        const index_t ConvDilationH = conv_filter_dilations[0];
-        const index_t ConvDilationW = conv_filter_dilations[1];
-
         if constexpr(ConvBwdDataSpecialization ==
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
                          Filter1x1Stride1Pad0)
         {
             const auto in_gemmm_gemmn_grid_desc =
-                make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1));
+                make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, C_), make_tuple(I0, I1));
 
             return in_gemmm_gemmn_grid_desc;
         }
         else
         {
-            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
-            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
-
-            const auto YTilde = ConvStrideH / GcdStrideDilationH;
-            const auto XTilde = ConvStrideW / GcdStrideDilationW;
-
-            const auto HTilde =
-                Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
-            const auto WTilde =
-                Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
-
             // only work on HTilde and WTilde that contribute to non-padding area of input tensor
             const auto IHTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
+                math::max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
             const auto IWTildeSliceBegin = math::integer_divide_floor(
-                math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);
+                math::max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
 
             const auto IHTildeSliceEnd = math::min(
-                HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+                HTilde_, math::integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
             const auto IWTildeSliceEnd = math::min(
-                WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+                WTilde_, math::integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
 
             const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
             const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
 
             // bias tensor
             const auto in_gemmmraw_gemmnraw_grid_desc = make_naive_tensor_descriptor(
-                make_tuple(N * HTildeSlice * WTildeSlice, C), make_tuple(I0, I1));
+                make_tuple(N_ * HTildeSlice * WTildeSlice, C_), make_tuple(I0, I1));
 
             const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor(
                 in_gemmmraw_gemmnraw_grid_desc,
@@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1
             return in_gemmm_gemmn_grid_desc;
         }
     }
+
+    IndexType N_;
+    IndexType Di_, Hi_, Wi_;
+    IndexType Do_, Ho_, Wo_;
+    IndexType Z_, Y_, X_;
+    IndexType K_, C_;
+    IndexType DiStride_, HiStride_, WiStride_;
+    IndexType DoStride_, HoStride_, WoStride_;
+    IndexType CStrideTensorB_, CStrideTensorC_, KStrideTensorA_, KStrideTensorB_;
+    IndexType NStrideTensorA_, NStrideTensorC_;
+    IndexType ConvStrideD_, ConvStrideH_, ConvStrideW_;
+    IndexType ConvDilationD_, ConvDilationH_, ConvDilationW_;
+    IndexType InLeftPadD_, InLeftPadH_, InLeftPadW_;
+    IndexType InRightPadD_, InRightPadH_, InRightPadW_;
+    IndexType IdxZTilde_, IdxYTilde_, IdxXTilde_;
+    IndexType GcdStrideDilationD_, GcdStrideDilationH_, GcdStrideDilationW_;
+    IndexType ZTilde_, YTilde_, XTilde_;
+    IndexType DTilde_, HTilde_, WTilde_;
+    IndexType ZDot_, YDot_, XDot_;
 };
 
 } // namespace tensor_operation
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
index 8edb71520..6d78da8db 100644
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -1,6 +1,10 @@
-add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_xdl_wmma.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_data_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+    target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+endif()
+add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
 add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
 if(result EQUAL 0)
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
new file mode 100644
index 000000000..7ad7b78d6
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_wmma.cpp
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataWmma : public ::testing::Test
+{
+    protected:
+    using DataType  = std::tuple_element_t<0, Tuple>;
+    using OutLayout = std::tuple_element_t<1, Tuple>;
+    using WeiLayout = std::tuple_element_t<2, Tuple>;
+    using InLayout  = std::tuple_element_t<3, Tuple>;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                            OutLayout,
+                                                                            WeiLayout,
+                                                                            InLayout,
+                                                                            DataType,
+                                                                            DataType,
+                                                                            DataType>(
+                               true,  // do_verification
+                               1,     // init_method: integer value
+                               false, // do_log
+                               false, // time_kernel
+                               param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+
+using KernelTypes2d = ::testing::Types<std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
+                                       std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
+                                       std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataWmma2d : public TestGroupedConvndBwdDataWmma<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataWmma3d : public TestGroupedConvndBwdDataWmma<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma3d, KernelTypes3d);
+
+TYPED_TEST(TestGroupedConvndBwdDataWmma2d, Test2D)
+{
+    this->conv_params.clear();
+
+    this->conv_params.push_back(
+        {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back(
+        {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->template Run<2>();
+}
+
+TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->template Run<3>();
+}
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
similarity index 78%
rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp
rename to test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
index 96506b876..fdc8fb64e 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_wmma.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -12,7 +12,7 @@
 #include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
 
 template <typename Tuple>
-class TestGroupedConvndBwdData : public ::testing::Test
+class TestGroupedConvndBwdDataXdl : public ::testing::Test
 {
     protected:
     using DataType  = std::tuple_element_t<0, Tuple>;
@@ -51,35 +51,31 @@ using namespace ck::tensor_layout::convolution;
 using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
-                                       std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<float, NHWGK, GKYXC, NHWGC>,
                                        std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
-                                       std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>,
-                                       std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
+                                       std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
 
 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
-                                       std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
                                        std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
-                                       std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>,
-                                       std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
+                                       std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>;
 
 template <typename Tuple>
-class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple>
+class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl<Tuple>
 {
 };
 
 template <typename Tuple>
-class TestGroupedConvndBwdData3d : public TestGroupedConvndBwdData<Tuple>
+class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl<Tuple>
 {
 };
 
-TYPED_TEST_SUITE(TestGroupedConvndBwdData2d, KernelTypes2d);
-TYPED_TEST_SUITE(TestGroupedConvndBwdData3d, KernelTypes3d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d);
 
-TYPED_TEST(TestGroupedConvndBwdData2d, Test2D)
+TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D)
 {
     this->conv_params.clear();
 
@@ -94,10 +90,13 @@ TYPED_TEST(TestGroupedConvndBwdData2d, Test2D)
     this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    // SplitN case
+    this->conv_params.push_back(
+        {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
     this->template Run<2>();
 }
 
-TYPED_TEST(TestGroupedConvndBwdData3d, Test3D)
+TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
 {
     this->conv_params.clear();
     this->conv_params.push_back(
@@ -112,5 +111,17 @@ TYPED_TEST(TestGroupedConvndBwdData3d, Test3D)
         {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
         {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    // SplitN case
+    this->conv_params.push_back({3,
+                                 1,
+                                 128,
+                                 4,
+                                 192,
+                                 {2, 2, 2},
+                                 {2, 224, 224},
+                                 {1, 224, 224},
+                                 {1, 1, 1},
+                                 {0, 0, 0},
+                                 {0, 0, 0}});
     this->template Run<3>();
 }
-- 
GitLab


From 5e6bd75a725e2c77459bb045b814b7eaded948f9 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:56:27 -0600
Subject: [PATCH 106/153] Add copy assignment op test (#1718)

* Add copy assignment op test

* Add a deep copy testing
---
 test/data_type/test_custom_type.cpp | 82 +++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 23 deletions(-)

diff --git a/test/data_type/test_custom_type.cpp b/test/data_type/test_custom_type.cpp
index a8fa9ba4a..b8c0d402a 100644
--- a/test/data_type/test_custom_type.cpp
+++ b/test/data_type/test_custom_type.cpp
@@ -51,8 +51,11 @@ TEST(Custom_bool, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bool_t, size> left_vec{right_vec};
+    vector_type<custom_bool_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bool_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
@@ -129,8 +132,11 @@ TEST(Custom_int8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_int8_t, size> left_vec{right_vec};
+    vector_type<custom_int8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_int8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -207,8 +213,11 @@ TEST(Custom_uint8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_uint8_t, size> left_vec{right_vec};
+    vector_type<custom_uint8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_uint8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -287,8 +296,11 @@ TEST(Custom_f8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_f8_t, size> left_vec{right_vec};
+    vector_type<custom_f8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_f8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -369,8 +381,11 @@ TEST(Custom_bf8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bf8_t, size> left_vec{right_vec};
+    vector_type<custom_bf8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bf8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -450,8 +465,11 @@ TEST(Custom_half, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_half_t, size> left_vec{right_vec};
+    vector_type<custom_half_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_half_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
@@ -533,8 +551,11 @@ TEST(Custom_bhalf, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bhalf_t, size> left_vec{right_vec};
+    vector_type<custom_bhalf_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bhalf_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
@@ -615,8 +636,11 @@ TEST(Custom_float, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_float_t, size> left_vec{right_vec};
+    vector_type<custom_float_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_float_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
@@ -693,8 +717,11 @@ TEST(Custom_double, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_double_t, size> left_vec{right_vec};
+    vector_type<custom_double_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_double_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
@@ -813,8 +840,11 @@ TEST(Complex_half, TestAsType)
         right_vec.template AsType<complex_half_t>()(Number<i>{}) =
             complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
     });
-    // copy the vector
-    vector_type<complex_half_t, size> left_vec{right_vec};
+    vector_type<complex_half_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<complex_half_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
@@ -907,8 +937,11 @@ TEST(FP8OCP, TestAsType)
         right_vec.template AsType<f8_t>()(Number<i>{}) = ck::type_convert<f8_t>(test_vec.at(i));
     });
 
-    // copy the vector
-    vector_type<f8_t, size> left_vec{right_vec};
+    vector_type<f8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<f8_t, size>{};
 
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
@@ -984,8 +1017,11 @@ TEST(BF8OCP, TestAsType)
         right_vec.template AsType<bf8_t>()(Number<i>{}) = ck::type_convert<bf8_t>(test_vec.at(i));
     });
 
-    // copy the vector
     vector_type<bf8_t, size> left_vec{right_vec};
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<bf8_t, size>{};
 
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
-- 
GitLab


From 355893cdd85418f3174a023aeb1ddba008951660 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 6 Dec 2024 13:04:25 -0800
Subject: [PATCH 107/153] Refactor CI performance tests. (#1726)

* merge the build and performance tests CI stages together

* add gemm performance test on gfx11/gfx12

* add suffices to distinguish gemm performance logs from different archs

* use smaller gemm set in CI for gfx10/gfx11/gfx12

* disable performance tests on gfx1030

* fix the shashing logic

* fix finding python3 for mha instances
---
 Jenkinsfile                                   | 286 ++++++------------
 .../gpu/mha/CMakeLists.txt                    |   6 +-
 script/process_perf_data.py                   |   4 +-
 script/process_perf_data.sh                   |  13 +
 script/process_qa_data.sh                     |  12 +
 script/run_full_performance_tests.sh          |   2 +-
 script/run_gemm_performance_tests.sh          |  41 +++
 script/run_performance_tests.sh               |  21 +-
 8 files changed, 176 insertions(+), 209 deletions(-)
 create mode 100755 script/run_gemm_performance_tests.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index 58cd72c8c..0a98cc5c6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -330,10 +330,8 @@ def cmake_build(Map conf=[:]){
         try{
             archiveArtifacts "perf_fmha_fwd_*.log"
             archiveArtifacts "perf_fmha_bwd_*.log"
-            stash name: "perf_fmha_fwd_gfx942.log"
-            stash name: "perf_fmha_bwd_gfx942.log"
-            stash name: "perf_fmha_fwd_gfx90a.log"
-            stash name: "perf_fmha_bwd_gfx90a.log"
+            stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
+            stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
         }
         catch(Exception err){
             echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
@@ -408,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
     }
 }
 
-def runCKProfiler(Map conf=[:]){
-        show_node_info()
-
-        env.HSA_ENABLE_SDMA=0
-        checkout scm
-
-        def image = getDockerImageName()
-        def prefixpath = conf.get("prefixpath", "/opt/rocm")
-
-        // Jenkins is complaining about the render group 
-        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
-        if (conf.get("enforce_xnack_on", false)) {
-            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
-        }
-        def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
-        def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
-        dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
-        echo "Docker flags: ${dockerOpts}"
-
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-
-        def variant = env.STAGE_NAME
-        def retimage
-
-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
-            try {
-                (retimage, image) = getDockerImage(conf)
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES'){
-                        sh 'rocminfo | tee rocminfo.log'
-                        if ( !runShell('grep -n "gfx" rocminfo.log') ){
-                            throw new Exception ("GPU not found")
-                        }
-                        else{
-                            echo "GPU is OK"
-                        }
-                    }
-                }
-            }
-            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                echo "The job was cancelled or aborted"
-                throw e
-            }
-
-            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 24, unit: 'HOURS')
-                {
-                    sh """
-                        rm -rf build
-                        mkdir build
-                    """
-                    dir("build"){
-                        unstash 'ckProfiler.tar.gz'
-                        sh 'tar -xvf ckProfiler.tar.gz'
-                    }
-
-					dir("script"){
-                        if (params.RUN_FULL_QA){
-                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
-                            archiveArtifacts "perf_gemm.log"
-                            archiveArtifacts "perf_resnet50_N256.log"
-                            archiveArtifacts "perf_resnet50_N4.log"
-                            archiveArtifacts "perf_batched_gemm.log"
-                            archiveArtifacts "perf_grouped_gemm.log"
-                            archiveArtifacts "perf_grouped_conv_fwd.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_data.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_weight.log"
-                            archiveArtifacts "perf_gemm_bilinear.log"
-                            archiveArtifacts "perf_reduction.log"
-                            archiveArtifacts "perf_splitK_gemm.log"
-                            archiveArtifacts "perf_onnx_gemm.log"
-                            archiveArtifacts "perf_mixed_gemm.log"
-                           // stash perf files to master
-                            stash name: "perf_gemm.log"
-                            stash name: "perf_resnet50_N256.log"
-                            stash name: "perf_resnet50_N4.log"
-                            stash name: "perf_batched_gemm.log"
-                            stash name: "perf_grouped_gemm.log"
-                            stash name: "perf_grouped_conv_fwd.log"
-                            stash name: "perf_grouped_conv_bwd_data.log"
-                            stash name: "perf_grouped_conv_bwd_weight.log"
-                            stash name: "perf_gemm_bilinear.log"
-                            stash name: "perf_reduction.log"
-                            stash name: "perf_splitK_gemm.log"
-                            stash name: "perf_onnx_gemm.log"
-                            stash name: "perf_mixed_gemm.log"
-                            //we will process results on the master node
-                        }
-                        else{
-                            sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
-                            archiveArtifacts "perf_gemm.log"
-                            archiveArtifacts "perf_resnet50_N256.log"
-                            archiveArtifacts "perf_resnet50_N4.log"
-                            // stash perf files to master
-                            stash name: "perf_gemm.log"
-                            stash name: "perf_resnet50_N256.log"
-                            stash name: "perf_resnet50_N4.log"
-                            //we will process the results on the master node
-                        }
-					}
-                }
-            }
-        }
-        return retimage
-}
-
-def runPerfTest(Map conf=[:]){
-    try{
-        runCKProfiler(conf)
-    }
-    catch(e){
-        echo "throwing error exception in performance tests"
-        echo 'Exception occurred: ' + e.toString()
-        throw e
-    }
-    finally{
-        if (!conf.get("no_reboot", false)) {
-            reboot()
-        }
-    }
-}
-
 def Build_CK(Map conf=[:]){
         show_node_info()
 
@@ -589,36 +465,95 @@ def Build_CK(Map conf=[:]){
                 throw e
             }
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 24, unit: 'HOURS')
+                timeout(time: 12, unit: 'HOURS')
                 {
                     //check whether to run performance tests on this node
-                    def do_perf_tests = 0
+                    def arch_type = 0
                     sh 'rocminfo | tee rocminfo.log'
-                    if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
-                        do_perf_tests = 1
-                        echo "Stash profiler and run performance tests"
+                    if ( runShell('grep -n "gfx90a" rocminfo.log') ){
+                        arch_type = 1
+                    }
+                    else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
+                        arch_type = 2
+                    }
+                    else if ( runShell('grep -n "gfx1030" rocminfo.log') ) {
+                        arch_type = 3
+                    }
+                    else if ( runShell('grep -n "gfx1101" rocminfo.log') ) {
+                        arch_type = 4
+                    }
+                    else if ( runShell('grep -n "gfx1201" rocminfo.log') ) {
+                        arch_type = 5
                     }
                     cmake_build(conf)
                     dir("build"){
-                        //run tests and examples
-                        //sh 'make -j check'
-                        if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){
-                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                            //do not stash profiler on nodes where we don't need to run performance tests
-                            sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
-                            stash name: "ckProfiler.tar.gz"
-                        }
-                        if (params.RUN_FULL_QA && do_perf_tests == 0 ){
-                            // build deb packages for all gfx9 targets and prepare to export
+                        if (params.RUN_FULL_QA && arch_type == 1 ){
+                            // build deb packages for all gfx9 targets on gfx90a system and prepare to export
+                            echo "Build ckProfiler package"
                             sh 'make -j package'
                             archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
-                            archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
                             sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
-                            stash name: "ckprofiler_0.2.0_amd64.deb"
+                            stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb"
+                        }
+                    }
+                    // run performance tests, stash the logs, results will be processed on the master node
+					dir("script"){
+                        if (params.RUN_PERFORMANCE_TESTS){
+                        if (params.RUN_FULL_QA && arch_type == 1){
+                            // run full tests on gfx90a
+                            echo "Run full performance tests"
+                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            archiveArtifacts "perf_gemm.log"
+                            archiveArtifacts "perf_resnet50_N256.log"
+                            archiveArtifacts "perf_resnet50_N4.log"
+                            archiveArtifacts "perf_batched_gemm.log"
+                            archiveArtifacts "perf_grouped_gemm.log"
+                            archiveArtifacts "perf_grouped_conv_fwd.log"
+                            archiveArtifacts "perf_grouped_conv_bwd_data.log"
+                            archiveArtifacts "perf_grouped_conv_bwd_weight.log"
+                            archiveArtifacts "perf_gemm_bilinear.log"
+                            archiveArtifacts "perf_reduction.log"
+                            archiveArtifacts "perf_splitK_gemm.log"
+                            archiveArtifacts "perf_onnx_gemm.log"
+                            archiveArtifacts "perf_mixed_gemm.log"
+                            stash includes: "perf_**.log", name: "perf_log"
+                        }
+                        else if ( arch_type == 1 ){
+                            // run standard tests on gfx90a
+                            echo "Run performance tests"
+                            sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            archiveArtifacts "perf_gemm.log"
+                            archiveArtifacts "perf_onnx_gemm.log"
+                            archiveArtifacts "perf_resnet50_N256.log"
+                            archiveArtifacts "perf_resnet50_N4.log"
+                            stash includes: "perf_**.log", name: "perf_log"
+                        }
+                        // disable performance tests on gfx1030 for now.
+                        //else if ( arch_type == 3){
+                            // run basic tests on gfx1030
+                        //    echo "Run gemm performance tests"
+                        //    sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
+                        //    archiveArtifacts "perf_onnx_gemm_gfx10.log"
+                        //    stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
+                        //}
+                        else if ( arch_type == 4){
+                            // run basic tests on gfx11
+                            echo "Run gemm performance tests"
+                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
+                            archiveArtifacts "perf_onnx_gemm_gfx11.log"
+                            stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
+                        }
+                        else if ( arch_type == 5 ){
+                            // run basic tests on gfx12
+                            echo "Run gemm performance tests"
+                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
+                            archiveArtifacts "perf_onnx_gemm_gfx12.log"
+                            stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
+                        }                        
                         }
                     }
-                    if (params.hipTensor_test && do_perf_tests == 0 ){
-                        //build and test hipTensor
+                    if (params.hipTensor_test && arch_type == 1 ){
+                        // build and test hipTensor on gfx90a node
                         sh """#!/bin/bash
                             rm -rf "${params.hipTensor_branch}".zip
                             rm -rf hipTensor-"${params.hipTensor_branch}"
@@ -690,10 +625,8 @@ def process_results(Map conf=[:]){
                 dir("script"){
                     if (params.RUN_CK_TILE_FMHA_TESTS){
                         try{
-                            unstash "perf_fmha_fwd_gfx942.log"
-                            unstash "perf_fmha_bwd_gfx942.log"
-                            unstash "perf_fmha_fwd_gfx90a.log"
-                            unstash "perf_fmha_bwd_gfx90a.log"
+                            unstash "perf_fmha_log_gfx942"
+                            unstash "perf_fmha_log_gfx90a"
                         }
                         catch(Exception err){
                             echo "could not locate the FMHA performance logs: ${err.getMessage()}."
@@ -703,26 +636,26 @@ def process_results(Map conf=[:]){
                         // unstash perf files to master
                         unstash "ckprofiler_0.2.0_amd64.deb"
                         sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
-                        unstash "perf_gemm.log"
-                        unstash "perf_resnet50_N256.log"
-                        unstash "perf_resnet50_N4.log"
-                        unstash "perf_batched_gemm.log"
-                        unstash "perf_grouped_gemm.log"
-                        unstash "perf_grouped_conv_fwd.log"
-                        unstash "perf_grouped_conv_bwd_data.log"
-                        unstash "perf_grouped_conv_bwd_weight.log"
-                        unstash "perf_gemm_bilinear.log"
-                        unstash "perf_reduction.log"
-                        unstash "perf_splitK_gemm.log"
-                        unstash "perf_onnx_gemm.log"
-                        unstash "perf_mixed_gemm.log"
+                        unstash "perf_log"
+                        try{
+                            unstash "perf_log_gfx11"
+                            unstash "perf_log_gfx12"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
+                        }
                         sh "./process_qa_data.sh"
                     }
                     else{
                         // unstash perf files to master
-                        unstash "perf_gemm.log"
-                        unstash "perf_resnet50_N256.log"
-                        unstash "perf_resnet50_N4.log"
+                        unstash "perf_log"
+                        try{
+                            unstash "perf_log_gfx11"
+                            unstash "perf_log_gfx12"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
+                        }
                         sh "./process_perf_data.sh"
                     }
                 }
@@ -1241,29 +1174,6 @@ pipeline {
                 }
             }
         }
-
-        stage("Performance Tests")
-        {
-            parallel
-            {
-                stage("Run ckProfiler: gfx90a")
-                {
-                    when {
-                        beforeAgent true
-                        expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
-                    }
-                    options { retry(1) }
-                    agent{ label rocmnode("gfx90a")}
-                    environment{
-                        setup_args = "NO_CK_BUILD"
-                    }
-                    steps{
-                        runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
-                        cleanWs()
-                    }
-                }
-            }
-        }
         stage("Process Performance Test Results")
         {
             parallel
diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
index a53fde166..0457588ea 100644
--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/)
 # CK Codegen requires dataclass which is added in Python 3.7
 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
 if(NOT CK_USE_ALTERNATIVE_PYTHON)
-   find_package(PythonInterp 3 REQUIRED)
+   find_package(Python3 COMPONENTS Interpreter Development)
 else()
    message("Using alternative python version")
    set(EXTRA_PYTHON_PATH)
@@ -33,7 +33,7 @@ set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
 # Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
 # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
   --api ${FMHA_KNOWN_APIS}
   --receipt 3
@@ -50,7 +50,7 @@ endif()
 # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
   OUTPUT ${FMHA_GEN_BLOBS}
-  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --output_dir ${FMHA_CPP_FOLDER}
   --api ${FMHA_KNOWN_APIS}
   --receipt 3
diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index 3892206e4..fbfec94ee 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -82,7 +82,7 @@ def parse_logfile(logfile):
     StrideA=[]
     StrideB=[]
     StrideC=[]
-    if 'perf_gemm.log' in logfile:
+    if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile:
         for line in open(logfile):
             if 'Best Perf' in line:
                 lst=line.split()
@@ -260,7 +260,7 @@ def main():
         conn = sqlEngine.connect()
 
         #save gemm performance tests:
-        if 'perf_gemm.log' in filename:
+        if 'perf_gemm' in filename and 'gemm_bilinear' not in filename:
             #write the ck_gemm_test_params table only needed once the test set changes
             #post_test_params(test_list,conn)
             for i in range(1,len(results)+1):
diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh
index af1e7e7a0..ae9346320 100755
--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -11,9 +11,22 @@
 
 #process results
 python3 process_perf_data.py perf_gemm.log
+python3 process_perf_data.py perf_onnx_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
 
+file=./perf_onnx_gemm_gfx10.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx10.log
+fi
+file=./perf_onnx_gemm_gfx11.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx11.log
+fi
+file=./perf_onnx_gemm_gfx12.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx12.log
+fi
 file=./perf_fmha_fwd_gfx942.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_fwd_gfx942.log
diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh
index c9a1645f6..fb8fe01c6 100755
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
 python3 process_perf_data.py perf_mixed_gemm.log
 
+file=./perf_onnx_gemm_gfx10.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx10.log
+fi
+file=./perf_onnx_gemm_gfx11.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx11.log
+fi
+file=./perf_onnx_gemm_gfx12.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx12.log
+fi
 file=./perf_fmha_fwd_gfx942.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_fwd_gfx942.log
diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh
index e167ce012..ddc5c270b 100755
--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -5,7 +5,7 @@
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
-# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
+# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verifuy correctness on CPU (may take a long time)
diff --git a/script/run_gemm_performance_tests.sh b/script/run_gemm_performance_tests.sh
new file mode 100755
index 000000000..12adad30f
--- /dev/null
+++ b/script/run_gemm_performance_tests.sh
@@ -0,0 +1,41 @@
+#!/bin/bash 
+#
+# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
+# run the script as "./run_gemm_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name> <arch>
+# input arguments: 
+# verification = 0 : do not verify result correctness on CPU
+#              = 1 : verify correctness on CPU (may take a long time)
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# node name        : $hostname
+# arch             : GPU architecture, e.g. "gfx9" or "gfx1100"
+
+#get the command line arguments:
+export verify=$1
+echo 'Verification: ' $verify
+export env_type=$2
+echo 'Environment type: ' $env_type
+export branch=$3
+echo 'Branch name: ' $branch
+export host_name=$4
+echo 'Host name: ' $host_name
+export arch=$5
+echo 'GPU architecture: ' $arch
+
+function print_log_header(){
+	rm -f $1;
+	echo 'On branch ' $3 &> $1;
+	echo 'Node name: ' $4 >> $1;
+	#get GPU_arch and number of compute units from rocminfo
+	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+	rocminfo | grep "Compute Unit:" >> $1;
+	hipcc --version | grep -e 'HIP version'  >> $1;
+	echo 'Environment type: ' $2 >> $1;
+	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+
+#run ONNX gemm tests
+export onnx_log="perf_onnx_gemm_$arch.log"
+print_log_header $onnx_log $env_type $branch $host_name
+./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
+./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh
index 317d27098..c8a281dc0 100755
--- a/script/run_performance_tests.sh
+++ b/script/run_performance_tests.sh
@@ -1,7 +1,7 @@
 #!/bin/bash 
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
+# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
@@ -51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log
 
-#run grouped_fwd fp16 tests
-export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log"
-print_log_header $conv_fwd_log $env_type $branch $host_name
-./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
-
-#run grouped_bwd_data fp16 tests
-export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log"
-print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
-./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
-
-#run grouped_bwd_weight fp16 tests
-export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log"
-print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
-./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+#run ONNX gemm tests
+export onnx_log="perf_onnx_gemm.log"
+print_log_header $onnx_log $env_type $branch $host_name
+./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
+./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
 
 #run resnet50 tests
 export resnet256_log="perf_resnet50_N256.log"
-- 
GitLab


From c773cc25a235dbc3c044b9cf7fb32910bc8fcae0 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 9 Dec 2024 08:50:36 -0800
Subject: [PATCH 108/153] remove unnecessary file (#1732)

---
 modified_files.txt | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100755 modified_files.txt

diff --git a/modified_files.txt b/modified_files.txt
deleted file mode 100755
index 34a42e3f3..000000000
--- a/modified_files.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
-example/01_gemm/run_gemm_example_streamk_v2.inc
-include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
-include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
-library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
-library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
-profiler/src/profile_gemm_universal_streamk.cpp
-modified_files.txt
-- 
GitLab


From 2f088b870764d406ec453987198deb298f3e9e3a Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 9 Dec 2024 09:32:14 -0800
Subject: [PATCH 109/153] update CI timeout limits (#1733)

---
 Jenkinsfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 0a98cc5c6..cb344e8a5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -377,7 +377,7 @@ def buildHipClangJob(Map conf=[:]){
 
         gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 48, unit: 'HOURS')
+                timeout(time: 20, unit: 'HOURS')
                 {
                     cmake_build(conf)
                 }
@@ -449,7 +449,7 @@ def Build_CK(Map conf=[:]){
             try {
                 (retimage, image) = getDockerImage(conf)
                 withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES'){
+                    timeout(time: 2, unit: 'MINUTES'){
                         sh 'rocminfo | tee rocminfo.log'
                         if ( !runShell('grep -n "gfx" rocminfo.log') ){
                             throw new Exception ("GPU not found")
@@ -465,7 +465,7 @@ def Build_CK(Map conf=[:]){
                 throw e
             }
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 12, unit: 'HOURS')
+                timeout(time: 20, unit: 'HOURS')
                 {
                     //check whether to run performance tests on this node
                     def arch_type = 0
@@ -620,7 +620,7 @@ def process_results(Map conf=[:]){
     }
 
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-        timeout(time: 1, unit: 'HOURS'){
+        timeout(time: 15, unit: 'MINUTES'){
             try{
                 dir("script"){
                     if (params.RUN_CK_TILE_FMHA_TESTS){
-- 
GitLab


From 23cf2026b496140e73a2990199f79e6257b228c7 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:11:20 -0800
Subject: [PATCH 110/153] build CI for gfx12 by default (#1734)

---
 Jenkinsfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index cb344e8a5..f118d4e45 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -675,8 +675,8 @@ def process_results(Map conf=[:]){
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
                                               0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
-                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                               0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
                                               0 13 * * * % BUILD_LEGACY_OS=true''' : ""
 
@@ -763,8 +763,8 @@ pipeline {
             description: "Test building instances for various architectures simultaneously (default: OFF)")
         booleanParam(
             name: "BUILD_GFX12",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx12 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx12 (default: ON)")
         booleanParam(
             name: "NINJA_BUILD_TRACE",
             defaultValue: false,
-- 
GitLab


From 94ae7113bd05e3c39364193dba1b391a4c54a2f4 Mon Sep 17 00:00:00 2001
From: rocking <ChunYu.Lai@amd.com>
Date: Tue, 10 Dec 2024 11:36:18 +0800
Subject: [PATCH 111/153] [CK TILE] Use config name instead of data type in
 FmhaFwdTypeConfig<config> (#1731)

* Add data type config, Prepare to add mix precision in the future

* Fix compile error
---
 .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 15 ++-
 .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py   | 14 +--
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   | 11 ++-
 .../01_fmha/codegen/ops/fmha_fwd_appendkv.py  |  9 +-
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   | 27 ++---
 example/ck_tile/01_fmha/fmha_bwd.cpp          | 14 +--
 example/ck_tile/01_fmha/fmha_bwd.hpp          | 12 ++-
 example/ck_tile/01_fmha/fmha_fwd.cpp          | 99 ++++++++++---------
 example/ck_tile/01_fmha/fmha_fwd.hpp          | 32 +++++-
 9 files changed, 142 insertions(+), 91 deletions(-)

diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
index 66691356a..f6df44a31 100644
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -2,10 +2,17 @@
 # Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 
-DTYPE_MAP = {
-    "fp16": "ck_tile::fp16_t",
-    "bf16": "ck_tile::bf16_t",
-    "fp8" : "ck_tile::fp8_t"
+FWD_DTYPE_MAP = {
+    "fp16"   : "FmhaFwdFp16",
+    "bf16"   : "FmhaFwdBf16",
+    "fp8"    : "FmhaFwdFp8",
+    "fp8fp16": "FmhaFwdFp8Fp16",
+    "fp8bf16": "FmhaFwdFp8Bf16"
+}
+
+BWD_DTYPE_MAP = {
+    "fp16": "FmhaBwdFp16",
+    "bf16": "FmhaBwdBf16"
 }
 
 MASK_IMPL = {
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index 096394c0c..83a1e82d6 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -283,7 +283,7 @@ class FmhaBwdApiPool:
                         inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline],
                                     F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
                                     F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
-                                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype],
+                                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype],
                                     F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                     F_deterministic=BOOL_MAP[trait.deterministic])
 
@@ -360,7 +360,7 @@ class FmhaBwdDQDKDVKernel:
             FMHA_BWD_DQ_DK_DV_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = BWD_DTYPE_MAP[self.F_dtype],
                 F_bm0           = self.F_tile.F_bm0,
                 F_bn0           = self.F_tile.F_bn0,
                 F_bk0           = self.F_tile.F_bk0,
@@ -469,7 +469,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaBwdApiPool(mask_impl)
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -585,7 +585,7 @@ class FmhaBwdOGradDotOKernel:
             FMHA_BWD_DOT_DO_O_KERNEL_BODY.format(
                 F_idx       = self.F_idx,
                 F_hdim      = self.F_hdim,
-                F_dtype     = DTYPE_MAP[self.F_dtype],
+                F_dtype     = BWD_DTYPE_MAP[self.F_dtype],
                 F_spad      = BOOL_MAP[self.F_spad],
                 F_dvpad     = BOOL_MAP[self.F_dvpad],
                 F_mode      = MODE_MAP[self.F_mode],
@@ -616,7 +616,7 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]:
 
     gen = list()
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -716,7 +716,7 @@ class FmhaBwdConvertQGradKernel:
             FMHA_BWD_CONVERT_DQ_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = BWD_DTYPE_MAP[self.F_dtype],
                 F_bm0           = self.F_bm0,
                 F_bn0           = self.F_bn0,
                 F_spad          = BOOL_MAP[self.F_spad],
@@ -751,7 +751,7 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]:
 
     gen = list()
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in BWD_DTYPE_MAP.keys():
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d == None:
             continue
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index e5ee1d22e..eca638784 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -282,7 +282,7 @@ class FmhaFwdApiPool:
                                    F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                    F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                    F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                                   F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -301,7 +301,7 @@ class FmhaFwdTileSize:
     F_bk1       : int  # tile size along kv gemm unroll
     F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
     F_rm0       : int  # number of warps for gemm0 along q seqlen
-    F_rn0       : int  # number of warps for gemm0 along k seqlen 
+    F_rn0       : int  # number of warps for gemm0 along k seqlen
     F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
     F_rm1       : int  # number of warps for gemm1 along q seqlen
     F_rn1       : int  # number of warps for gemm1 along head dim v
@@ -339,7 +339,7 @@ class FmhaFwdKernel:
             FMHA_FWD_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
                 F_bm0           = self.F_tile.F_bm0,
                 F_bn0           = self.F_tile.F_bn0,
                 F_bk0           = self.F_tile.F_bk0,
@@ -462,6 +462,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
             # no need lse/dropout kernels
             for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
                 pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask))
+        elif dtype in ['fp8fp16', 'fp8bf16']:
+            # TODO
+            None
         else:
             assert False
         return pipelines
@@ -469,7 +472,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
     gen = list()
     api_pool = FmhaFwdApiPool(mask_impl)
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_tile_dict_from_dtype(dtype)
         if d == None:
             continue
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index cfd1d01c9..fb998a33d 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool:
                     inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout],
                                    F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_rope_check=ROPE_CHECK_MAP[trait.rope],
                                    F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                                   F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -216,7 +216,7 @@ class FmhaFwdAppendKVKernel:
             FMHA_FWD_APPENDKV_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
                 F_bs            = self.F_tile.F_bs,
                 F_bsk           = self.F_tile.F_bsk,
                 F_bd            = self.F_tile.F_bd,
@@ -301,6 +301,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         elif dtype in ['fp8', 'bf8']:
             # rope/paged-kv is not supported
             pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f'))
+        elif dtype in ['fp8fp16', 'fp8bf16']:
+            # TODO
+            None
         else:
             assert False
         return pipelines
@@ -308,7 +311,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaFwdAppendKVApiPool(mask_impl)
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype)
         if d == None:
             continue
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 1c40cf6f3..e448902cf 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -112,7 +112,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }}
 
 using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, 
+                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
                         {F_dvpad}>;
 
 #include <iostream>
@@ -161,7 +161,7 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem<
     typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
     typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
     {F_hdim},
-    {F_bm0}, 
+    {F_bm0},
     {F_bn1},
     {F_mode},
     fmha_trait>;
@@ -231,11 +231,11 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a
     if(s.log_level_ > 0)
     std::cout
     << ", " << fmha_fwd_splitkv_get_name_<fmha_fwd_splitkv_traits_>()
-    << ", " << fmha_fwd_splitkv_combine_get_name_<fmha_fwd_splitkv_combine_traits_>() 
+    << ", " << fmha_fwd_splitkv_combine_get_name_<fmha_fwd_splitkv_combine_traits_>()
     << std::flush;
 
     return ck_tile::launch_kernel(s,
-        [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_<fmha_fwd_splitkv_traits_>(s_, a); }}, 
+        [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_<fmha_fwd_splitkv_traits_>(s_, a); }},
         [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_combine_oneshot_<fmha_fwd_splitkv_combine_traits_>(s_, a); }}
     );
 }}
@@ -431,11 +431,11 @@ class FmhaFwdSplitKVApiPool:
                     inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
                                    F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask],
                                    F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
-                                   F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], 
+                                   F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv],
                                    F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                    F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                    F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                                   F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
+                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
                 per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
@@ -472,7 +472,7 @@ class FmhaFwdSplitKVKernel:
             FMHA_FWD_SPLITKV_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
                 F_bm0           = self.F_tile.F_bm0,
                 F_bn0           = self.F_tile.F_bn0,
                 F_bk0           = self.F_tile.F_bk0,
@@ -492,7 +492,7 @@ class FmhaFwdSplitKVKernel:
                 F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                 F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
                 F_dpad          = BOOL_MAP[self.F_pipeline.F_dpad],
-                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],    
+                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
                 F_bias          = BIAS_MAP[self.F_pipeline.F_bias],
                 F_lse           = BOOL_MAP[self.F_pipeline.F_lse],
                 F_squant        = BOOL_MAP[self.F_pipeline.F_squant],
@@ -552,7 +552,7 @@ class FmhaFwdSplitKVCombineKernel:
             FMHA_FWD_SPLITKV_COMBINE_KERNEL_BODY.format(
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
-                F_dtype         = DTYPE_MAP[self.F_dtype],
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
                 F_bm0           = self.F_tile.F_bm0,
                 F_bn1           = self.F_tile.F_bn1,
                 F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
@@ -625,7 +625,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         pipelines = []
         if dtype in ['fp16', 'bf16']:
             for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
-                # TODO: use async pipeline when compiler is more stable 
+                # TODO: use async pipeline when compiler is more stable
                 if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
                 # if True:
                     pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
@@ -644,6 +644,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         elif dtype in ['fp8', 'bf8']:
             for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
                 pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask))
+        elif dtype in ['fp8fp16', 'fp8bf16']:
+            # TODO
+            None
         else:
             assert False
         return pipelines
@@ -651,7 +654,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     gen = list()
     api_pool = FmhaFwdSplitKVApiPool(mask_impl)
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_tile_dict_from_dtype(dtype)
         if d == None:
             continue
@@ -711,7 +714,7 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
 
     gen = list()
 
-    for dtype in DTYPE_MAP.keys():
+    for dtype in FWD_DTYPE_MAP.keys():
         d = get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype)
         if d == None:
             continue
diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp
index 2d76627a7..eaf99529f 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -101,7 +101,7 @@ auto create_args(int argc, char* argv[])
 }
 
 // different threshold for different dtype
-template <typename DataType>
+template <typename DataTypeConfig>
 auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
 {
     double rtol = 1e-2;
@@ -110,7 +110,7 @@ auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
 }
 
 template <>
-auto get_elimit<ck_tile::bf16_t>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
+auto get_elimit<FmhaBwdBf16>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
 {
     double rtol = 1e-2;
     double atol = 1e-2;
@@ -122,7 +122,7 @@ auto get_elimit<ck_tile::bf16_t>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_
     return ck_tile::make_tuple(rtol, atol);
 }
 
-template <typename DataType>
+template <typename DataTypeConfig>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     std::string data_type    = arg_parser.get_str("prec");
@@ -209,7 +209,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q);
     const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k);
 
-    using TypeConfig = FmhaBwdTypeConfig<DataType>;
+    using TypeConfig = FmhaBwdTypeConfig<DataTypeConfig>;
 
     using QDataType             = typename TypeConfig::QDataType;
     using KDataType             = typename TypeConfig::KDataType;
@@ -933,7 +933,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }
         // clang-format on
 
-        auto [rtol, atol] = get_elimit<DataType>(hdim_q, hdim_v);
+        auto [rtol, atol] = get_elimit<DataTypeConfig>(hdim_q, hdim_v);
         bool dq_cur_pass  = ck_tile::check_err(dq_host_result,
                                               dq_host_ref,
                                               std::string("Error: QGrad Incorrect results!"),
@@ -986,11 +986,11 @@ int main(int argc, char* argv[])
     const std::string data_type = arg_parser.get_str("prec");
     if(data_type == "fp16")
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        return run<FmhaBwdFp16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "bf16")
     {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+        return run<FmhaBwdBf16>(arg_parser) ? 0 : -2;
     }
 
     return -3;
diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp
index 722ef15a2..6204cbcfa 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -14,11 +14,19 @@
 #include <utility>
 #include <variant>
 
+struct FmhaBwdFp16
+{
+};
+
+struct FmhaBwdBf16
+{
+};
+
 template <typename DataType>
 struct FmhaBwdTypeConfig;
 
 template <>
-struct FmhaBwdTypeConfig<ck_tile::half_t>
+struct FmhaBwdTypeConfig<FmhaBwdFp16>
 {
     using QDataType             = ck_tile::half_t;
     using KDataType             = ck_tile::half_t;
@@ -38,7 +46,7 @@ struct FmhaBwdTypeConfig<ck_tile::half_t>
 };
 
 template <>
-struct FmhaBwdTypeConfig<ck_tile::bf16_t>
+struct FmhaBwdTypeConfig<FmhaBwdBf16>
 {
     using QDataType             = ck_tile::bf16_t;
     using KDataType             = ck_tile::bf16_t;
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 1f0d73d95..ebf2c93a3 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -142,7 +142,7 @@ auto create_args(int argc, char* argv[])
 }
 
 // different threshold for different dtype
-template <typename DataType>
+template <typename DataTypeConfig>
 auto get_elimit(std::string /*init_method*/)
 {
     double rtol = 1e-3;
@@ -151,7 +151,7 @@ auto get_elimit(std::string /*init_method*/)
 }
 
 template <>
-auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+auto get_elimit<FmhaFwdBf16>(std::string /*init_method*/)
 {
     double rtol = 1e-2;
     double atol = 1e-2;
@@ -159,7 +159,7 @@ auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
 }
 
 template <>
-auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+auto get_elimit<FmhaFwdFp8>(std::string init_method)
 {
     if(init_method == "ui" || init_method == "ni")
     {
@@ -261,7 +261,7 @@ int override_num_splits_if_necessary(
     return num_splits;
 }
 
-template <typename DataType>
+template <typename DataTypeConfig>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     std::string data_type    = arg_parser.get_str("prec");
@@ -305,8 +305,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 
     ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim");
-    if constexpr(!(std::is_same_v<DataType, ck_tile::fp16_t> ||
-                   std::is_same_v<DataType, ck_tile::bf16_t>))
+    if constexpr(!(std::is_same_v<DataTypeConfig, FmhaFwdFp16> ||
+                   std::is_same_v<DataTypeConfig, FmhaFwdBf16>))
     {
         if(0 < rotary_dim)
         {
@@ -428,25 +428,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
             return atoi(squant_str.c_str()) != 0 ? true : false;
     }();
 
-    float range_q = arg_parser.get_float("range_q");
-    float range_k = arg_parser.get_float("range_k");
-    float range_v = arg_parser.get_float("range_v");
-    float range_p = arg_parser.get_float("range_p");
-    float range_o = arg_parser.get_float("range_o");
-
-    float dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<DataType>::max());
-
-    float scale_p = 1.f;
-    float scale_o = 1.f;
-
-    if(squant)
-    {
-        scale_s = scale_s * (range_q / dtype_max) * (range_k / dtype_max);
-        scale_p = dtype_max / range_p;
-        // scale_p = [max(fp8_t)/range_o] * [range_p/max(fp8_t)] * [range_v/max(fp8_t)]
-        scale_o = range_p * range_v / range_o / dtype_max;
-    }
-
     std::string vlayout = arg_parser.get_str("vlayout");
     bool lse            = arg_parser.get_bool("lse");
 
@@ -499,7 +480,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     const auto seqstart_k_host              = to_seqstarts(seqlen_ks);
     const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads);
 
-    using TypeConfig = FmhaFwdTypeConfig<DataType>;
+    using TypeConfig = FmhaFwdTypeConfig<DataTypeConfig>;
 
     using QDataType             = typename TypeConfig::QDataType;
     using KDataType             = typename TypeConfig::KDataType;
@@ -513,6 +494,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using OaccDataType          = typename TypeConfig::OaccDataType;
     using ODataType             = typename TypeConfig::ODataType;
 
+    float range_q = arg_parser.get_float("range_q");
+    float range_k = arg_parser.get_float("range_k");
+    float range_v = arg_parser.get_float("range_v");
+    float range_p = arg_parser.get_float("range_p");
+    float range_o = arg_parser.get_float("range_o");
+
+    float q_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<QDataType>::max());
+    float k_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<KDataType>::max());
+    float v_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<VDataType>::max());
+    float p_dtype_max = v_dtype_max; // assume p and v is the same type
+    float o_dtype_max = ck_tile::type_convert<float>(ck_tile::numeric<ODataType>::max());
+
+    float scale_p = 1.f;
+    float scale_o = 1.f;
+
+    if(squant)
+    {
+        scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max);
+        scale_p = p_dtype_max / range_p;
+        scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max);
+    }
+
     // accumulation numbers for performance evaluation
     std::size_t flop = 0, num_byte = 0;
     auto max_seqlen_q =
@@ -709,14 +712,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     else if(init_method == "ufq" || init_method == "uf:q" ||
             init_method == "3") // suitable for fp8 quantization
     {
-        ck_tile::FillUniformDistribution<QDataType>{-dtype_max, dtype_max, seed}(q_host);
-        ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(k_host);
-        ck_tile::FillUniformDistribution<KDataType>{-dtype_max, dtype_max, seed}(knew_host);
-        ck_tile::FillUniformDistribution<VDataType>{-dtype_max, dtype_max, seed}(v_host);
-        ck_tile::FillUniformDistribution<VDataType>{-dtype_max, dtype_max, seed}(vnew_host);
+        ck_tile::FillUniformDistribution<QDataType>{-q_dtype_max, q_dtype_max, seed}(q_host);
+        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(k_host);
+        ck_tile::FillUniformDistribution<KDataType>{-k_dtype_max, k_dtype_max, seed}(knew_host);
+        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(v_host);
+        ck_tile::FillUniformDistribution<VDataType>{-v_dtype_max, v_dtype_max, seed}(vnew_host);
 
         // bias_fp8 = qscale_bias * bias_fp32
-        float qscale_bias = (dtype_max / range_q) * (dtype_max / range_k);
+        float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k);
         // Assume bias is in [-1.f, 1.f] in original fp32
         ck_tile::FillUniformDistribution<BiasDataType>{-qscale_bias, qscale_bias, seed}(bias_host);
     }
@@ -1129,14 +1132,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     randval_buf.FromDevice(randval_host.data());
 
     auto p_compute_element_func = [&]() {
-        if constexpr(std::is_same_v<DataType, ck_tile::fp8_t>)
+        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
             return ck_tile::scales{scale_p};
         else
             return ck_tile::identity{};
     }();
 
     auto oacc_element_func = [&]() {
-        if constexpr(std::is_same_v<DataType, ck_tile::fp8_t>)
+        if constexpr(std::is_same_v<DataTypeConfig, ck_tile::fp8_t>)
             return ck_tile::composes(ck_tile::saturates<ck_tile::fp8_t>{},
                                      ck_tile::scales{scale_o});
         else
@@ -1186,7 +1189,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         {
             decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths());
 
-            auto [rotary_cos_slice, rotary_sin_slice] = 
+            auto [rotary_cos_slice, rotary_sin_slice] =
                 slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q);
 
             ck_tile::reference_batched_rotary_position_embedding(
@@ -1202,13 +1205,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 k_host_ref.ForEach([&](auto& self, auto i) {
                     self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]);
                 });
-            } else {     
+            } else {
                 k_host_ref.ForEach([&](auto& self, auto i) {
                     self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]);
                 });
             }
         } else
-#endif 
+#endif
         {
             if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); });
             else       k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); });
@@ -1229,7 +1232,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             {
                 knew_host_ref_ro.emplace(knew_host_ref.get_lengths());
 
-                auto [rotary_cos_slice, rotary_sin_slice] = 
+                auto [rotary_cos_slice, rotary_sin_slice] =
                     slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew);
 
                 ck_tile::reference_batched_rotary_position_embedding(
@@ -1251,19 +1254,19 @@ bool run(const ck_tile::ArgParser& arg_parser)
         if(0 < page_block_size) {
             if(is_v_rowmajor) {
                 if(i_perm) {
-                    v_host_ref.ForEach([&](auto& self, auto i) { 
-                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); 
+                    v_host_ref.ForEach([&](auto& self, auto i) {
+                        self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]);
                     });
                 } else {
-                    v_host_ref.ForEach([&](auto& self, auto i) { 
+                    v_host_ref.ForEach([&](auto& self, auto i) {
                         self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]);
                     });
                 }
             }
-            else 
+            else
             {
-                if(i_perm) { 
-                    v_host_ref.ForEach([&](auto& self, auto i) { 
+                if(i_perm) {
+                    v_host_ref.ForEach([&](auto& self, auto i) {
                         self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size);
                     });
                 } else {
@@ -1458,7 +1461,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         else       o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); });
         // clang-format on
 
-        auto [rtol, atol] = get_elimit<DataType>(init_method);
+        auto [rtol, atol] = get_elimit<DataTypeConfig>(init_method);
         bool cur_pass     = ck_tile::check_err(
             o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
         pass &= cur_pass;
@@ -1515,15 +1518,15 @@ int main(int argc, char* argv[])
     const std::string data_type = arg_parser.get_str("prec");
     if(data_type == "fp16")
     {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdFp16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "bf16")
     {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdBf16>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "fp8")
     {
-        return run<ck_tile::fp8_t>(arg_parser) ? 0 : -2;
+        return run<FmhaFwdFp8>(arg_parser) ? 0 : -2;
     }
 
     return -3;
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 8a821b917..aee54b475 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -16,11 +16,35 @@
 #include <utility>
 #include <variant>
 
+struct FmhaFwdFp16
+{
+};
+
+struct FmhaFwdBf16
+{
+};
+
+struct FmhaFwdFp8
+{
+};
+
+struct FmhaFwdBf8
+{
+};
+
+struct FmhaFwdFp8Fp16
+{
+};
+
+struct FmhaFwdFp8Bf16
+{
+};
+
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 
 template <>
-struct FmhaFwdTypeConfig<ck_tile::half_t>
+struct FmhaFwdTypeConfig<FmhaFwdFp16>
 {
     using QDataType             = ck_tile::half_t;
     using KDataType             = ck_tile::half_t;
@@ -36,7 +60,7 @@ struct FmhaFwdTypeConfig<ck_tile::half_t>
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck_tile::bf16_t>
+struct FmhaFwdTypeConfig<FmhaFwdBf16>
 {
     using QDataType             = ck_tile::bf16_t;
     using KDataType             = ck_tile::bf16_t;
@@ -52,7 +76,7 @@ struct FmhaFwdTypeConfig<ck_tile::bf16_t>
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck_tile::fp8_t>
+struct FmhaFwdTypeConfig<FmhaFwdFp8>
 {
     using QDataType             = ck_tile::fp8_t;
     using KDataType             = ck_tile::fp8_t;
@@ -68,7 +92,7 @@ struct FmhaFwdTypeConfig<ck_tile::fp8_t>
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck_tile::bf8_t>
+struct FmhaFwdTypeConfig<FmhaFwdBf8>
 {
     using QDataType             = ck_tile::bf8_t;
     using KDataType             = ck_tile::bf8_t;
-- 
GitLab


From 67497a044d450fbc0bcb099cfb0aa270cfb0aa6b Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com>
Date: Tue, 10 Dec 2024 16:47:36 +0000
Subject: [PATCH 112/153] Make sure we call __hneg with half to remove ambigios
 error (#1736)

---
 include/ck/utility/math_v2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index a6c3540d8..eaa1c6813 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -611,7 +611,7 @@ inline __device__ int8_t neg<int8_t>(int8_t x)
 template <>
 inline __device__ half_t neg<half_t>(half_t x)
 {
-    return __hneg(x);
+    return __hneg(static_cast<__half>(x));
 };
 
 template <typename T>
-- 
GitLab


From 90d8410d562220ba65e7e75f10e7b3996409200f Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 10 Dec 2024 08:48:51 -0800
Subject: [PATCH 113/153] Upgrade to Ubuntu22.04 as default OS. (#1738)

* upgrade to ubuntu 22.04

* try adding -u roof docker options for ubuntu 22
---
 Dockerfile          | 5 +++--
 Dockerfile.compiler | 2 +-
 Jenkinsfile         | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6689ae08f..8ce158a20 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 ARG DEBIAN_FRONTEND=noninteractive
 ARG ROCMVERSION=6.3
 ARG compiler_version=""
@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libnuma-dev \
     libpthread-stubs0-dev \
     llvm-amdgpu \
+    mpich \
     net-tools \
     pkg-config \
     python \
@@ -70,7 +71,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     rm -rf /var/lib/apt/lists/* && \
     rm -rf amdgpu-install* && \
 # Remove unnecessary rocm components that take a lot of space
-    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev
+    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
 
 # Update the cmake to version 3.27.5
 RUN pip install --upgrade cmake==3.27.5 && \
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
index 3f3329092..a22103b96 100644
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
diff --git a/Jenkinsfile b/Jenkinsfile
index f118d4e45..f82c34afa 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -40,10 +40,10 @@ def getBaseDockerImageName(){
     else{
         def ROCM_numeric = "${params.ROCMVERSION}" as float
         if ( ROCM_numeric < 6.4 ){
-            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}"
             }
         else{
-            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}"
             }
         }
     return img
@@ -357,7 +357,7 @@ def buildHipClangJob(Map conf=[:]){
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
         // Jenkins is complaining about the render group 
-        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+        def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
         if (conf.get("enforce_xnack_on", false)) {
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
@@ -426,7 +426,7 @@ def Build_CK(Map conf=[:]){
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
         // Jenkins is complaining about the render group 
-        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+        def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
         if (conf.get("enforce_xnack_on", false)) {
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
-- 
GitLab


From 357a0b1c57d2f6b4eb9607d26047ba2e0b679f72 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 10 Dec 2024 15:16:03 -0800
Subject: [PATCH 114/153] add missing stdexcept header (#1740)

---
 codegen/test/rtc/include/rtc/hip.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/codegen/test/rtc/include/rtc/hip.hpp b/codegen/test/rtc/include/rtc/hip.hpp
index 6b523382d..e962d4cd3 100644
--- a/codegen/test/rtc/include/rtc/hip.hpp
+++ b/codegen/test/rtc/include/rtc/hip.hpp
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime_api.h>
 #include <memory>
 #include <string>
+#include <stdexcept>
 
 namespace rtc {
 
-- 
GitLab


From 77a38e0211f587775c233fc0afd4de819d51500c Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Thu, 12 Dec 2024 03:54:03 +0000
Subject: [PATCH 115/153] [CK_TILE] naive attn (#1708)

* add reference attention fwd

* refactor addresser

* update

* paged, and i8 reflect-quant

* lets call it forward-quant

* fix error in decode variation

* update naive-attn

* fix page table

* fix build err
---
 example/ck_tile/01_fmha/fmha_fwd.cpp    |  57 +-
 include/ck_tile/README.md               |   3 +
 include/ck_tile/core.hpp                |   1 +
 include/ck_tile/ops/gemm.hpp            |   2 +-
 include/ck_tile/ref/README.md           |   5 +
 include/ck_tile/ref/naive_attention.hpp | 666 ++++++++++++++++++++++++
 include/ck_tile/remod.py                |   4 +
 7 files changed, 734 insertions(+), 4 deletions(-)
 create mode 100644 include/ck_tile/ref/README.md
 create mode 100644 include/ck_tile/ref/naive_attention.hpp

diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index ebf2c93a3..08d263da9 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -3,6 +3,7 @@
 
 #include "fmha_fwd.hpp"
 #include "ck_tile/host.hpp"
+#include "ck_tile/ref/naive_attention.hpp"
 #include "mask.hpp"
 #include "rotary.hpp"
 #include "utils.hpp"
@@ -41,7 +42,7 @@ std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "weather do CPU validation or not")
+    arg_parser.insert("v", "1", "0:no validation, 2:cpu validation, 2:gpu validation(experimental)")
         .insert("mode", "0", "kernel mode. 0:batch, 1:group")
         .insert("b", "2", "batch size")
         .insert("h", "8", "num of head, for q")
@@ -447,7 +448,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 
     bool s_randval = false;
-    if(p_drop > 0.0f && do_validation)
+    if(p_drop > 0.0f && do_validation != 0)
     {
         s_randval = true;
     }
@@ -1121,11 +1122,61 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
               << " GB/s" << std::flush;
 
-    if(!do_validation)
+    if(do_validation == 0)
     {
         std::cout << std::flush << std::endl;
         return true;
     }
+    if(do_validation == 2)
+    {
+        // NOTE: use gpu to do validation
+        ck_tile::naive_attention_fwd_traits naive_t;
+        naive_t.q_type    = data_type;
+        naive_t.k_type    = data_type;
+        naive_t.v_type    = data_type;
+        naive_t.o_type    = data_type;
+        naive_t.q_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.k_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.v_layout  = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.o_layout  = o_perm == 1 ? "bhsd" : "bshd";
+        naive_t.variation = 0; // TODO?
+
+        ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes());
+
+        ck_tile::naive_attention_fwd_args naive_a;
+        naive_a.q_ptr           = q_buf.GetDeviceBuffer();
+        naive_a.k_ptr           = k_buf.GetDeviceBuffer();
+        naive_a.v_ptr           = v_buf.GetDeviceBuffer();
+        naive_a.o_ptr           = o_naive_buf.GetDeviceBuffer();
+        naive_a.scale_s         = scale_s;
+        naive_a.context_len_ptr = nullptr; // used when seqlen kv come from a pointer
+        naive_a.page_table_ptr =
+            nullptr; // [batch, num_blocks] seqlen_kv is in different block(paged attn)
+        naive_a.hdim           = hdim_q;
+        naive_a.hdim_v         = hdim_v; // could be cross-attn, where V and Q/K hdim are different
+        naive_a.batch_q        = batch;
+        naive_a.batch_kv       = batch;
+        naive_a.batch_ratio_kv = 1; // batch_q / batch_kv
+        naive_a.seqlen_q       = seqlen_qs[0];
+        naive_a.seqlen_kv = seqlen_ks[0]; // if context_len_ptr is not nullptr, ignore this field
+        naive_a.nhead_q   = nhead;
+        naive_a.nhead_kv  = nhead_k;
+        naive_a.nhead_ratio_kv = naive_a.nhead_q / naive_a.nhead_kv; // nhead_q / nhead_kv
+        naive_a.page_size      = 0; // if paged, the seqlen-kv for each block
+
+        ck_tile::stream_config naive_s{};
+
+        naive_attention_fwd(naive_t, naive_a, naive_s);
+
+        auto o_naive_ref = o_naive_buf.ToHost<ODataType>();
+        o_buf.FromDevice(o_host.data()); // TODO: ugly
+
+        auto [rtol_, atol_] = get_elimit<DataTypeConfig>(init_method);
+        bool pass_          = ck_tile::check_err(
+            o_host, o_naive_ref, std::string("OUT Error: Incorrect results!"), rtol_, atol_);
+        std::cout << ", valid:" << (pass_ ? "y" : "n") << std::flush << std::endl;
+        return pass_;
+    }
 
     o_buf.FromDevice(o_host.data());
     lse_buf.FromDevice(lse_host.data());
diff --git a/include/ck_tile/README.md b/include/ck_tile/README.md
index 9f88af1ca..9d5e92391 100644
--- a/include/ck_tile/README.md
+++ b/include/ck_tile/README.md
@@ -45,5 +45,8 @@ our implementation of different device operators.
 **[ops/epilogue]**  
 epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues.
 
+**[ref]**  
+reference implementation of cpu or gpu. This folder is supposed to include a specific header on demand.
+
 ## examples
 currently we put all ck_tile related example under [/example/ck_tile](/example/ck_tile/) folder. Please check each example's subfolder.
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 3cf0c2595..41f3383c7 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -54,6 +54,7 @@
 #include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/tensor/tile_window_utils.hpp"
 #include "ck_tile/core/tensor/update_tile.hpp"
+#include "ck_tile/core/utility/amd_address_space.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/functional_with_tuple.hpp"
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 82d35b9c5..2d38ef592 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -23,10 +23,10 @@
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
-#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
diff --git a/include/ck_tile/ref/README.md b/include/ck_tile/ref/README.md
new file mode 100644
index 000000000..6efee782f
--- /dev/null
+++ b/include/ck_tile/ref/README.md
@@ -0,0 +1,5 @@
+# reference
+
+this folder contains reference implementation of a specific op. Note by including a specific header, you are including the implementation(expecially the gpu implementation) into your source code, and compile that kernel into the fatbin, hence may increase your kernel obj code length. Usually the header starts with `reference_` is a cpu reference implementation. The header starts with `naive_` contains a gpu implementation with a small launcher.
+
+TODO: move `host/reference` under this folder
diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp
new file mode 100644
index 000000000..09ded761e
--- /dev/null
+++ b/include/ck_tile/ref/naive_attention.hpp
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include <thread>
+#include <string>
+
+namespace ck_tile {
+
+enum class naive_attention_layout_enum
+{
+    BSHD,  // [batch, seqlen, nhead, hdim]
+    BHSD,  // [batch, nhead, seqlen, hdim]
+    BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed
+    PHSD,  // [pages, nhead, page_size, hdim]
+    // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen
+    PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen
+    PHDS,  // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen
+};
+
+// will used to specialize kernel variation
+enum class naive_attention_variation_enum
+{
+    FLASH_BATCHED = 0, // standard flash attention, or xformer/sdpa, used for training
+    FLASH_GROUPED,
+    DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache
+};
+
+// TODO: for simplicity, this will be used as host/device arg
+struct naive_attention_fwd_args
+{
+    void* q_ptr;
+    void* k_ptr;
+    void* v_ptr;
+    void* o_ptr;
+    void* context_len_ptr; // [batch] used when seqlen kv come from a pointer(each element is a
+                           // number, not cumsum)
+    void* page_table_ptr;  // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn)
+    void* kvscale_ptr;     // [nhead, 2(kv), hdim] used for kvcache dequant
+    float scale_s;
+    int hdim;
+    int hdim_v; // could be cross-attn, where V and Q/K hdim are different
+    int batch_q;
+    int batch_kv;
+    int batch_ratio_kv; // batch_q / batch_kv
+    int seqlen_q;       // in decode case, this should be 1
+    int seqlen_kv;      // if context_len_ptr is not nullptr, ignore this field
+    int nhead_q;
+    int nhead_kv;
+    int nhead_ratio_kv; // nhead_q / nhead_kv
+    int page_size;      // if paged, the seqlen-kv per each block
+    int max_pages_per_seq;
+};
+
+// this is trait for host API
+struct naive_attention_fwd_traits
+{
+    std::string q_type;
+    std::string k_type;
+    std::string v_type;
+    std::string o_type;
+    std::string q_layout;
+    std::string k_layout;
+    std::string v_layout;
+    std::string o_layout;
+    int variation; // sync with naive_attention_variation_enum
+};
+
+// this is trait for kernel template
+template <naive_attention_variation_enum variation_>
+struct naive_attention_fwd_kernel_traits
+{
+    static constexpr naive_attention_variation_enum variation = variation_;
+};
+
+// for simplicity, please do not use const-reference type for the template type
+template <typename QType,
+          typename KType,
+          typename VType,
+          typename OType,
+          typename AccType,
+          naive_attention_layout_enum QLayout,
+          naive_attention_layout_enum KLayout,
+          naive_attention_layout_enum VLayout,
+          naive_attention_layout_enum OLayout,
+          typename Traits>
+struct naive_attention_fwd_kernel
+{
+    static constexpr bool is_kvcache_i8 =
+        std::is_same_v<KType, int8_t> && std::is_same_v<VType, int8_t> && sizeof(QType) != 1;
+
+    // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original
+    // K/V matrix. This can speed up conversion since Q/P usually is fp16/bf16/fp32
+    static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8;
+
+    // TODO: hardcode
+    using KVScaleType = float;
+    using SoftmaxType = float;
+    using PType       = VType; // src A of gemm2, same type as V
+
+    using p_vec_type                = ext_vector_t<PType, 16 / sizeof(PType)>;
+    static constexpr int p_vec_elem = vector_traits<p_vec_type>::vector_size;
+
+    __host__ __device__ naive_attention_fwd_kernel() {}
+
+    template <typename T, naive_attention_layout_enum Layout>
+    struct addresser
+    {
+        int b, s, h, d; // batch, seqlen, nhead, hdim
+        T* base_ptr;
+        __device__ addresser(int b_, int s_, int h_, int d_, void* base_ptr_)
+            : b(b_), s(s_), h(h_), d(d_), base_ptr(reinterpret_cast<T*>(base_ptr_))
+        {
+        }
+
+        // TODO: all the batch/nhead offset will accumulate to the base pointer
+        __device__ T* get_base(int i_b, int i_h)
+        {
+            if constexpr(Layout == naive_attention_layout_enum::BSHD)
+                return base_ptr + i_b * s * h * d + i_h * d;
+            else if constexpr(Layout == naive_attention_layout_enum::BHSD)
+                return base_ptr + i_b * s * h * d + i_h * s * d;
+        }
+
+        __device__ int get_offset(int i_s, int i_d)
+        {
+            if constexpr(Layout == naive_attention_layout_enum::BSHD)
+                return i_s * h * d + i_d;
+            else if constexpr(Layout == naive_attention_layout_enum::BHSD)
+                return i_s * d + i_d;
+        }
+
+        // below set of API will directly use pointer inside this struct
+        __device__ void init(int i_b, int i_h) { base_ptr = get_base(i_b, i_h); }
+        __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
+        __device__ void store(T value, int i_s, int i_d) { base_ptr[get_offset(i_s, i_d)] = value; }
+    };
+
+    template <typename T, naive_attention_layout_enum Layout>
+    struct page_addresser
+    {
+        int s, h, d;                             // page_size, nhead, hdim
+        static constexpr int x = 16 / sizeof(T); // pack 4 dword
+        T* base_ptr;
+        int* page_table_ptr; // TODO: page table always int
+        int i_h;             // store current head
+
+        __device__ page_addresser(int s_, int h_, int d_, void* base_ptr_, void* pptr_)
+            : s(s_),
+              h(h_),
+              d(d_),
+              base_ptr(reinterpret_cast<T*>(base_ptr_)),
+              page_table_ptr(reinterpret_cast<int*>(pptr_))
+        {
+        }
+
+        __device__ int64_t get_phy_page_idx(int i_s)
+        {
+            // dynamic compute page idx is simple but slow
+            int page_idx = i_s / s;
+            int phy      = page_table_ptr[page_idx];
+            return static_cast<int64_t>(phy);
+        }
+
+        __device__ int get_phy_page_offset(int i_s)
+        {
+            // dynamic compute page idx is simple but slow
+            return i_s % s;
+        }
+
+        __device__ int64_t get_offset(int i_s, int i_d)
+        {
+            int page_offset  = get_phy_page_offset(i_s);
+            int64_t page_idx = get_phy_page_idx(i_s);
+            int64_t base_    = page_idx * h * s * d;
+            if constexpr(Layout == naive_attention_layout_enum::PHSD)
+                return static_cast<int64_t>(i_h * s * d + page_offset * d + i_d) + base_;
+            else if constexpr(Layout == naive_attention_layout_enum::PHDSX)
+            {
+                int d_r = i_d / x;
+                int d_x = i_d % x;
+                return static_cast<int64_t>(i_h * d * s + d_r * s * x + page_offset * x + d_x) +
+                       base_;
+            }
+            else if constexpr(Layout == naive_attention_layout_enum::PHDS)
+            {
+                return static_cast<int64_t>(i_h * d * s + i_d * s + page_offset) + base_;
+            }
+        }
+
+        // below set of API will directly use pointer inside this struct
+        __device__ void init(int /*i_b*/, int i_h_) { i_h = i_h_; }
+        __device__ T load(int i_s, int i_d) { return base_ptr[get_offset(i_s, i_d)]; }
+        __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {}
+    };
+
+    template <typename T>
+    struct kvscale_addresser
+    {
+        int h, d; // nhead, hdim
+        T* base_ptr;
+        __device__ kvscale_addresser(int h_, int d_, void* p_)
+            : h(h_), d(d_), base_ptr(reinterpret_cast<T*>(p_))
+        {
+        }
+        __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/)
+        {
+            // [h, 2, d]
+            return i_h * 2 * d + i_kv * d + i_d;
+        }
+        __device__ T load(int i_h, int i_d, int i_kv)
+        {
+            return base_ptr[get_offset(i_h, i_d, i_kv)];
+        }
+    };
+
+    __device__ __host__ static constexpr int get_block_size() { return 256; }
+
+    // for simpliciy, 1 WG always compute 1 token along q, compute all token along kv
+    // compute all hdim from q, compute WG_SIZE hdim from v
+    // 1) in prefill case, seqlen_q >= 1, seqlen_kv >= 1, batch_q=batch_kv
+    // 2) in decode case, seqlen_q = 1, batch_q is input num-tokens, batch_kv is 1
+    // 3) in paged-attn case, we still use 1 WG compute all the seqlen-kv for simplicity
+    // TODO: could support split-kv to validate intermediate logsum
+    __host__ static dim3 get_grid_size(naive_attention_fwd_args args)
+    {
+        constexpr int wg_size = get_block_size();
+        auto g =
+            dim3((args.hdim_v + wg_size - 1) / wg_size, args.seqlen_q, args.batch_q * args.nhead_q);
+        return g;
+    }
+
+    // reduce single pixel within a wave
+    template <typename T, typename F>
+    __device__ constexpr T wave_reduce(T local, F reduce_f)
+    {
+        // constexpr int wave_size = 64;
+        constexpr int reduce_stage = 6; // 1<<6=64
+        T v_local                  = local;
+#pragma unroll
+        for(int i_stage = 0; i_stage < reduce_stage; i_stage++)
+        {
+            int src_lane = __lane_id() ^ (1 << i_stage);
+            int32_t v_remote_tmp =
+                __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
+            T v_remote = bit_cast<T>(v_remote_tmp);
+            v_local    = reduce_f(v_local, v_remote);
+        }
+        return v_local;
+    }
+
+    // Note: this function must be called after wave_reduce
+    // Note: better not use this under if...else... with thread divergence (syncthreads)
+    template <typename T, typename F>
+    __device__ constexpr T cross_wave_reduce(T local, F reduce_f, T* smem)
+    {
+        constexpr int waves     = 4;
+        constexpr int wave_size = 64;
+        int lane_id             = threadIdx.x % wave_size;
+
+        __syncthreads();
+        smem[threadIdx.x] = local;
+        __syncthreads();
+
+        // the data within single wave is the same
+        // but for simplicity, we still use data from each lane.
+        T v_local = smem[lane_id];
+#pragma unroll
+        for(int i_stage = 1; i_stage < waves; i_stage++)
+        {
+            T v_remote = smem[i_stage * wave_size + lane_id];
+            v_local    = reduce_f(v_local, v_remote);
+        }
+        return v_local;
+    }
+
+    // kernel entry point
+    __device__ void operator()(naive_attention_fwd_args args)
+    {
+        constexpr int wg_size = get_block_size();
+        __shared__ char smem[wg_size * 4 * sizeof(float)]; //  should enough
+        int i_dv    = blockIdx.x * wg_size + threadIdx.x;  // index of hdim_v
+        int i_sq    = blockIdx.y;                          // index of seqlen_q
+        int i_batch = blockIdx.z;                          // index of batch_q * nhead_q
+        int i_bq    = i_batch / args.nhead_q;              // index of batch_q
+        int i_hq    = i_batch % args.nhead_q;              // index of nhead_q
+
+        int i_bk = i_bq / args.batch_ratio_kv;
+        int i_hk = i_hq / args.nhead_ratio_kv;
+
+        void* page_table_ptr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return reinterpret_cast<int*>(args.page_table_ptr) + i_bq * args.max_pages_per_seq;
+            }
+            else
+            {
+                return nullptr;
+            }
+        }();
+
+        auto q_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<QType, QLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return addresser<QType, QLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim, args.q_ptr};
+            }
+        }();
+        auto k_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<KType, KLayout>{
+                    args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim, args.k_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return page_addresser<KType, KLayout>{
+                    args.page_size, args.nhead_kv, args.hdim, args.k_ptr, page_table_ptr};
+            }
+        }();
+        auto v_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<VType, VLayout>{
+                    args.batch_kv, args.seqlen_kv, args.nhead_kv, args.hdim_v, args.v_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return page_addresser<VType, VLayout>{
+                    args.page_size, args.nhead_kv, args.hdim_v, args.v_ptr, page_table_ptr};
+            }
+        }();
+        auto o_addr = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return addresser<OType, OLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return addresser<OType, OLayout>{
+                    args.batch_q, args.seqlen_q, args.nhead_q, args.hdim_v, args.o_ptr};
+            }
+        }();
+
+        q_addr.init(i_bq, i_hq);
+        k_addr.init(i_bk, i_hk);
+        v_addr.init(i_bk, i_hk);
+        o_addr.init(i_bq, i_hq);
+
+        auto f_max        = [](auto x_, auto y_) { return max(x_, y_); };
+        auto f_sum        = [](auto x_, auto y_) { return x_ + y_; };
+        auto f_absmax_f32 = [](float v_0_, float v_1_) {
+            float rtn;
+            asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+
+        int seqlen_kv = [&]() {
+            if constexpr(Traits::variation == naive_attention_variation_enum::FLASH_BATCHED)
+            {
+                return args.seqlen_kv;
+            }
+            else if constexpr(Traits::variation == naive_attention_variation_enum::DECODE_PAGED)
+            {
+                return reinterpret_cast<int*>(args.context_len_ptr)[i_bq];
+            }
+        }();
+
+        SoftmaxType row_max = -numeric<SoftmaxType>::infinity();
+        SoftmaxType l{0};
+        AccType o_acc = {0};
+
+        int sk_loops   = (seqlen_kv + wg_size - 1) / wg_size;
+        float qf_scale = .0f;
+        kvscale_addresser<KVScaleType> kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr};
+
+        if constexpr(is_kvcache_i8_forward_quant)
+        {
+            // AccType is i32 now, seqlen_q = 1, hdim up to 256
+            float q   = 0;
+            float k_s = 0;
+            if(static_cast<int>(threadIdx.x) < args.hdim)
+            {
+                q   = type_convert<float>(q_addr.load(0, threadIdx.x));
+                k_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 0));
+            }
+            // 1) we apply the k scale to q
+            float q_forwarded = q * k_s;
+
+            // 2) apply smooth-quant
+            // find absmax
+            float qf_max = wave_reduce(q_forwarded, f_absmax_f32);
+            qf_max       = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+
+            // per-token scale
+            qf_scale = qf_max / 127.0;
+
+            // devide by scale
+            q = q / qf_scale;
+
+            // fp32->i8
+            int8_t quantized_q = static_cast<int8_t>(q);
+            __syncthreads();
+            reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_q;
+            __syncthreads();
+
+            // after above process, we have 2 data
+            // 1) int8 q data stored in smem(no need to reload)
+            // 2) per-token scale qf_scale, to be mul after 1st gemm
+        }
+
+        for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++)
+        {
+            int i_sk = i_loop1 * wg_size + threadIdx.x;
+            // gemm-1
+            SoftmaxType s_softmax = -numeric<SoftmaxType>::infinity();
+            if(i_sk < seqlen_kv)
+            {
+                AccType s_acc{0}; // clear for every loop
+                for(auto i_dq = 0; i_dq < args.hdim; i_dq++)
+                {
+                    if constexpr(is_kvcache_i8_forward_quant)
+                    {
+                        int8_t q = reinterpret_cast<int8_t*>(smem)[i_dq];
+                        auto k   = k_addr.load(i_sk, i_dq);
+
+                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
+                    }
+                    else
+                    {
+                        auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load
+                        auto k = k_addr.load(i_sk, i_dq);
+
+                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
+                    }
+                }
+                // scale
+                s_softmax = type_convert<SoftmaxType>(s_acc);
+                s_softmax *=
+                    type_convert<SoftmaxType>(args.scale_s * ck_tile::log2e_v<SoftmaxType>);
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    s_softmax *= qf_scale; // post scale the per-token factor
+                }
+            }
+
+            // s->p
+            float pf_scale = 0.; // used for i8 quant
+            {
+                // softmax, find max
+                SoftmaxType old_max = row_max;
+                SoftmaxType cur_max = wave_reduce(s_softmax, f_max);
+
+                cur_max = cross_wave_reduce(cur_max, f_max, reinterpret_cast<SoftmaxType*>(smem));
+                row_max = max(old_max, cur_max); // update row_max
+                // softmax, exp(i_elem - max)
+                SoftmaxType p_compute = __builtin_amdgcn_exp2f(s_softmax - row_max);
+
+                // compute exp_sum
+                SoftmaxType row_sum = wave_reduce(p_compute, f_sum);
+                row_sum = cross_wave_reduce(row_sum, f_sum, reinterpret_cast<SoftmaxType*>(smem));
+
+                // l, pre-scall o_acc
+                SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max);
+                l               = tmp * l + row_sum;
+                o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+
+                // prepare the p_compute into smem, to let every thread read same p_compute and do
+                // 2nd gemm
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    float v_s = 0;
+                    if(static_cast<int>(threadIdx.x) < args.hdim_v)
+                    {
+                        v_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 1));
+                    }
+
+                    // 1) we apply the v scale to p
+                    float p_forwarded = p_compute * v_s;
+
+                    // 2) apply smooth-quant
+                    // find absmax
+                    float pf_max = wave_reduce(p_forwarded, f_absmax_f32);
+                    pf_max =
+                        cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+
+                    // per-token scale
+                    pf_scale = pf_max / 127.0;
+
+                    // devide by scale
+                    p_compute = p_compute / pf_scale;
+
+                    // fp32->i8
+                    int8_t quantized_p = static_cast<int8_t>(p_compute);
+                    __syncthreads();
+                    reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_p;
+                    __syncthreads();
+                    // after above process, we have 2 data
+                    // 1) int8 p data stored in smem(no need to reload)
+                    // 2) per-token scale pf_scale, to be mul after 2nd gemm
+                }
+                else
+                {
+                    __syncthreads();
+                    reinterpret_cast<PType*>(smem)[threadIdx.x] = type_convert<PType>(p_compute);
+                    __syncthreads();
+                }
+            }
+
+            // gemm-2, simple loop over vector by vector
+            constexpr int gemm_2_loop = wg_size / p_vec_elem;
+            {
+                AccType o_acc_local = {0};
+                int sk_start = i_loop1 * wg_size; // we start from the first seqlen_kv element
+                for(int i_loop2 = 0; i_loop2 < gemm_2_loop; i_loop2++)
+                {
+                    p_vec_type p_vec = reinterpret_cast<p_vec_type*>(smem)[i_loop2];
+#pragma unroll
+                    for(int i_j = 0; i_j < p_vec_elem; i_j++)
+                    {
+                        int sv_offset = i_loop2 * p_vec_elem + i_j;
+                        int i_sv      = sk_start + sv_offset;
+
+                        VType v = 0.f;
+                        if(i_dv < args.hdim_v && i_sv < seqlen_kv)
+                        {
+                            v = v_addr.load(i_sv, i_dv);
+                        }
+
+                        o_acc_local += type_convert<AccType>(p_vec[i_j]) * type_convert<AccType>(v);
+                    }
+                }
+                if constexpr(is_kvcache_i8_forward_quant)
+                {
+                    // apply pr scale to local acc
+                    o_acc_local =
+                        type_convert<AccType>(type_convert<float>(o_acc_local) * pf_scale);
+                }
+                o_acc += o_acc_local;
+            }
+        }
+
+        // post scale o_acc
+        {
+            SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking
+            o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+        }
+
+        // store O
+        if(i_dv < args.hdim_v)
+            o_addr.store(type_convert<OType>(o_acc), i_sq, i_dv);
+    }
+};
+
+#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_()                                                        \
+    {                                                                                                       \
+        using ktraits_ =                                                                                    \
+            naive_attention_fwd_kernel_traits<static_cast<naive_attention_variation_enum>(                  \
+                variation_)>;                                                                               \
+        using k_   = naive_attention_fwd_kernel<q_type_,                                                    \
+                                              k_type_,                                                    \
+                                              v_type_,                                                    \
+                                              o_type_,                                                    \
+                                              acc_type_,                                                  \
+                                              q_layout_,                                                  \
+                                              k_layout_,                                                  \
+                                              v_layout_,                                                  \
+                                              o_layout_,                                                  \
+                                              ktraits_>;                                                  \
+        dim3 grids = k_::get_grid_size(a);                                                                  \
+        r          = ck_tile::launch_kernel(s,                                                              \
+                                   ck_tile::make_kernel(k_{}, grids, k_::get_block_size(), 0, a)); \
+    }
+
+#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_()                                                 \
+    if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \
+       t.o_layout == "bshd")                                                                       \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto v_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BSHD;                              \
+        constexpr int variation_ = 0;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }                                                                                              \
+    else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" &&                    \
+            t.v_layout == "bhsd" && t.o_layout == "bhsd")                                          \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto v_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr int variation_ = 0;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }                                                                                              \
+    else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" &&                   \
+            t.v_layout == "phds" && t.o_layout == "bhsd")                                          \
+    {                                                                                              \
+        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX;                             \
+        constexpr auto v_layout_ = naive_attention_layout_enum::PHDS;                              \
+        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
+        constexpr int variation_ = 2;                                                              \
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
+    }
+
+//
+CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t,
+                                       naive_attention_fwd_args a,
+                                       ck_tile::stream_config s)
+{
+    float r = -1;
+    // TODO: do not explicitly create too much instance!
+    if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16")
+    {
+        using q_type_   = fp16_t;
+        using k_type_   = fp16_t;
+        using v_type_   = fp16_t;
+        using o_type_   = fp16_t;
+        using acc_type_ = float;
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16")
+    {
+        using q_type_   = bf16_t;
+        using k_type_   = bf16_t;
+        using v_type_   = bf16_t;
+        using o_type_   = bf16_t;
+        using acc_type_ = float;
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16")
+    {
+        using q_type_   = bf16_t;
+        using k_type_   = int8_t;
+        using v_type_   = int8_t;
+        using o_type_   = bf16_t;
+        using acc_type_ = int32_t; // NOTE!
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16")
+    {
+        using q_type_   = fp16_t;
+        using k_type_   = int8_t;
+        using v_type_   = int8_t;
+        using o_type_   = fp16_t;
+        using acc_type_ = int32_t; // NOTE!
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    return r;
+}
+
+#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
+#undef CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_
+
+} // namespace ck_tile
diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py
index b0d2c36ef..9f2ef3389 100644
--- a/include/ck_tile/remod.py
+++ b/include/ck_tile/remod.py
@@ -7,6 +7,7 @@ import copy
 
 NS = 'ck_tile'
 OPS = 'ops'
+REF = 'ref'
 OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
 
 HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
@@ -29,6 +30,9 @@ class submodule_t:
     def push(self, f):
         if len(f.parents) != 1: # ignore ./xxx.hpp
             mod = get_module(f)
+            # ref is supposed to include one header on demand
+            if mod == REF:
+                return
             if mod == OPS:
                 if mod not in self.m.keys():
                     self.m[mod] = dict()
-- 
GitLab


From 4e73177684817d425fc583b8827dd09d0c609e94 Mon Sep 17 00:00:00 2001
From: chenjun <46212055+junhaha666@users.noreply.github.com>
Date: Fri, 13 Dec 2024 11:53:52 +0800
Subject: [PATCH 116/153] Ck tile/smoothquant out stride (#1742)

* add ck_tile/smoothquant out stride parameter

* Remove the default stride value

---------

Co-authored-by: so <a.com>
---
 .../12_smoothquant/example_smoothquant.cpp    | 44 +++++++++++--------
 .../ck_tile/12_smoothquant/smoothquant.cpp    | 44 +++++++++++--------
 .../smoothquant/kernel/smoothquant_kernel.hpp | 20 ++++++---
 3 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
index 3a26eb6a7..aa1d1adfd 100644
--- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -35,7 +35,8 @@ auto create_args(int argc, char* argv[])
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3328", "m dimension")
         .insert("n", "4096", "n dimension")
-        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("x_stride", "-1", "input stride per row, if -1 then equal to n")
+        .insert("y_stride", "-1", "output stride per row, if -1 then equal to n")
         .insert("e", "1e-5", "epsilon")
         .insert("v", "1", "cpu validation or not")
         .insert("prec", "fp16", "precision")
@@ -49,11 +50,14 @@ auto create_args(int argc, char* argv[])
 template <typename DataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    ck_tile::index_t m      = arg_parser.get_int("m");
-    ck_tile::index_t n      = arg_parser.get_int("n");
-    ck_tile::index_t stride = arg_parser.get_int("stride");
-    if(stride < 0)
-        stride = n;
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
     std::string data_type = arg_parser.get_str("prec");
     int do_validation     = arg_parser.get_int("v");
     int warmup            = arg_parser.get_int("warmup");
@@ -68,14 +72,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using ComputeDataType = float;
 
     // host verify
-    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
     ck_tile::HostTensor<XScaleDataType> xscale_host({n});
 
     ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
     ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
 
-    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
-    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {y_stride, 1});
 
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
@@ -116,7 +120,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       qy_buf.GetDeviceBuffer(),
                                       m,
                                       n,
-                                      stride};
+                                      x_stride,
+                                      y_stride};
 
     auto kargs = Kernel::MakeKargs(args);
 
@@ -133,7 +138,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         using YDataType = ComputeDataType;
-        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {stride, 1});
+        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {y_stride, 1});
         // smooth outlier
         {
             auto f = [&](auto n_) {
@@ -183,7 +188,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             qy_buf.FromDevice(qy_host_dev.data());
             auto [rtol, atol] = get_elimit<QYDataType>();
 
-            if(stride == n)
+            if(y_stride == n)
             {
                 pass = ck_tile::check_err(qy_host_dev,
                                           qy_host_ref,
@@ -195,10 +200,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
             {
                 for(int i_r = 0; i_r < m; i_r++)
                 {
-                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
-                                                            qy_host_dev.begin() + i_r * stride + n);
-                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
-                                                            qy_host_ref.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride,
+                                                            qy_host_dev.begin() + i_r * y_stride +
+                                                                n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride,
+                                                            qy_host_ref.begin() + i_r * y_stride +
+                                                                n);
                     pass &= ck_tile::check_err(qy_host_dev_row,
                                                qy_host_ref_row,
                                                std::string("qy[") + std::to_string(i_r) +
@@ -210,8 +217,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
         }
 
         std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", stride:" << stride
-                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+                  << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+                  << ", y_stride:" << y_stride << ", valid:" << (pass ? "y" : "n") << std::flush
+                  << std::endl;
     }
 
     return pass;
diff --git a/example/ck_tile/12_smoothquant/smoothquant.cpp b/example/ck_tile/12_smoothquant/smoothquant.cpp
index ed01d654f..fd1c4ec7b 100644
--- a/example/ck_tile/12_smoothquant/smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.cpp
@@ -33,7 +33,8 @@ auto create_args(int argc, char* argv[])
     ck_tile::ArgParser arg_parser;
     arg_parser.insert("m", "3328", "m dimension")
         .insert("n", "4096", "n dimension")
-        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("x_stride", "-1", "input stride per row, if -1 then equal to n")
+        .insert("y_stride", "-1", "output stride per row, if -1 then equal to n")
         .insert("v", "1", "cpu validation or not")
         .insert("kname", "1", "print kernel name or not")
         .insert("prec", "fp16", "precision")
@@ -47,18 +48,21 @@ auto create_args(int argc, char* argv[])
 template <typename DataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    ck_tile::index_t m      = arg_parser.get_int("m");
-    ck_tile::index_t n      = arg_parser.get_int("n");
-    ck_tile::index_t stride = arg_parser.get_int("stride");
-    if(stride < 0)
-        stride = n;
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
     std::string data_type = arg_parser.get_str("prec");
     int kname             = arg_parser.get_int("kname");
     int do_validation     = arg_parser.get_int("v");
     int warmup            = arg_parser.get_int("warmup");
     int repeat            = arg_parser.get_int("repeat");
 
-    assert(stride >= n);
+    assert(x_stride >= n);
 
     using TypeConfig = SmoothquantTypeConfig<DataType>;
 
@@ -69,14 +73,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using ComputeDataType = typename TypeConfig::ComputeDataType;
 
     // host verify
-    ck_tile::HostTensor<XDataType> x_host({m, n}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
     ck_tile::HostTensor<XScaleDataType> xscale_host({n});
 
     ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
     ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
 
-    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
-    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {y_stride, 1});
 
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<XScaleDataType>{1e-3, .5f}(xscale_host);
@@ -90,7 +94,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
     xscale_buf.ToDevice(xscale_host.data());
 
     std::cout << "[" << data_type << "]"
-              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride
+              << std::flush;
 
     smoothquant_traits traits{data_type};
 
@@ -100,7 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                           qy_buf.GetDeviceBuffer(),
                           m,
                           n,
-                          stride};
+                          x_stride,
+                          y_stride};
 
     float ave_time = smoothquant(
         traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
@@ -116,7 +122,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         using YDataType = ComputeDataType;
-        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {stride, 1});
+        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {y_stride, 1});
         // smooth outlier
         {
             auto f = [&](auto n_) {
@@ -166,7 +172,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             qy_buf.FromDevice(qy_host_dev.data());
             auto [rtol, atol] = get_elimit<QYDataType>();
 
-            if(stride == n)
+            if(y_stride == n)
             {
                 pass = ck_tile::check_err(qy_host_dev,
                                           qy_host_ref,
@@ -178,10 +184,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
             {
                 for(int i_r = 0; i_r < m; i_r++)
                 {
-                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
-                                                            qy_host_dev.begin() + i_r * stride + n);
-                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
-                                                            qy_host_ref.begin() + i_r * stride + n);
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride,
+                                                            qy_host_dev.begin() + i_r * y_stride +
+                                                                n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride,
+                                                            qy_host_ref.begin() + i_r * y_stride +
+                                                                n);
                     pass &= ck_tile::check_err(qy_host_dev_row,
                                                qy_host_ref_row,
                                                std::string("qy[") + std::to_string(i_r) +
diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
index 6ec333516..0b3d9d6ca 100644
--- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
@@ -19,7 +19,8 @@ struct SmoothquantHostArgs
 
     index_t m;
     index_t n;
-    index_t stride; // row_stride
+    index_t x_stride; // input row_stride
+    index_t y_stride; // output row_stride
 };
 
 // TODO: Extract some type to wrapper class
@@ -58,14 +59,21 @@ struct Smoothquant
 
         index_t m;
         index_t n;
-        index_t stride; // row_stride
+        index_t x_stride; // input row_stride
+        index_t y_stride; // out row_stride
     };
     using Hargs = SmoothquantHostArgs;
 
     CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
     {
-        return Kargs{
-            hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
+        return Kargs{hargs.p_x,
+                     hargs.p_xscale,
+                     hargs.p_yscale,
+                     hargs.p_qy,
+                     hargs.m,
+                     hargs.n,
+                     hargs.x_stride,
+                     hargs.y_stride};
     }
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
@@ -116,7 +124,7 @@ struct Smoothquant
             const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const XDataType*>(kargs.p_x),
                 make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.x_stride, 1),
                 number<Vector_N>{},
                 number<1>{});
 
@@ -157,7 +165,7 @@ struct Smoothquant
             auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<QYDataType*>(kargs.p_qy),
                 make_tuple(kargs.m, kargs.n),
-                make_tuple(kargs.stride, 1),
+                make_tuple(kargs.y_stride, 1),
                 number<Vector_N>{},
                 number<1>{});
 
-- 
GitLab


From 4d8fce33dddfc003432ae06848f6416a9d5d5e2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Fri, 13 Dec 2024 21:08:35 +0100
Subject: [PATCH 117/153] Add SplitK support into Batched GEMM V3 (#1729)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add bmm api

* add bf16 multi_d

* add ckProfiler for bf16

* add ckProfiler files

* add more instance; fixed 64bit index issue

* fixed naming

* enabled batched Ds

* use long_index for ds offsets

* clean

* add bmm fp8 ckProfiler

* Update example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update example/24_batched_gemm/run_batched_gemm_example_rowwise.inc

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update profiler/src/profile_gemm_universal_batched.cpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* Update profiler/include/profiler/profile_gemm_universal_batched_impl.hpp

Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>

* clean

* Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp

* Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp

* Update library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp

* Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp

* Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp

* Update include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp

* refactor batch offset func

* add splitk suppport into bmm_v3

* clean

* clean

* format

* fixed

* fix

---------

Co-authored-by: Jing Zhang <jizhan@fb.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
---
 .../batched_gemm_xdl_bf16_v3.cpp              |   4 +-
 .../device/device_batched_gemm_multi_d.hpp    |   3 +-
 ...atched_gemm_multiple_d_xdl_cshuffle_v3.hpp |  45 ++++--
 .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp |  16 +-
 ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp |   3 +
 ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp |   2 +
 .../profile_gemm_universal_batched_impl.hpp   | 148 ++++++++++--------
 .../src/profile_gemm_universal_batched.cpp    |  20 +--
 8 files changed, 137 insertions(+), 104 deletions(-)

diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
index fa8b75218..548500518 100644
--- a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
@@ -78,14 +78,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD
     2,              // ABlockTransferSrcVectorDim
     8,              // ABlockTransferSrcScalarPerVector
     8,              // ABlockTransferDstScalarPerVector_AK1
-    1,              // ABlockLdsExtraM
+    0,              // ABlockLdsExtraM
     S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
     S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
     S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
     2,              // BBlockTransferSrcVectorDim
     8,              // BBlockTransferSrcScalarPerVector
     8,              // BBlockTransferDstScalarPerVector_BK1
-    1,              // BBlockLdsExtraN
+    0,              // BBlockLdsExtraN
     1,              // CShuffleMXdlPerWavePerShuffle
     1,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
index 58c0288e8..8fb4a71f5 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
@@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator
                         index_t BatchStrideE,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) = 0;
+                        CDEElementwiseOperation cde_element_op,
+                        index_t KBatch) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
index 314ecdf76..5f5bea4f8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -41,12 +41,15 @@ __global__ void
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t g_idx = blockIdx.z % karg.Batch;
+    const index_t k_idx = blockIdx.z / karg.Batch;
 
     const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
     const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
     const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
     const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
 
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
+
     // populate pointer, desc for Ds
     static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
         // D pointer
@@ -54,8 +57,8 @@ __global__ void
     });
 
     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset,
-        karg.p_b_grid + b_batch_offset,
+        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
         karg.p_ds_grid,
         karg.p_c_grid + c_batch_offset,
         p_shared,
@@ -87,12 +90,15 @@ __global__ void
     __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
     const index_t g_idx = blockIdx.z % karg.Batch;
+    const index_t k_idx = blockIdx.z / karg.Batch;
 
     const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
     const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
     const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
     const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
 
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
+
     // populate pointer, desc for Ds
     static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
         // D pointer
@@ -100,8 +106,8 @@ __global__ void
     });
 
     GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset,
-        karg.p_b_grid + b_batch_offset,
+        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
         karg.p_ds_grid,
         karg.p_c_grid + c_batch_offset,
         p_shared_0,
@@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                  index_t Batch_,
                  AElementwiseOperation a_element_op_,
                  BElementwiseOperation b_element_op_,
-                 CElementwiseOperation c_element_op_)
+                 CElementwiseOperation c_element_op_,
+                 index_t KBatch_)
             : GridwiseGemm::Argument{p_a_grid_,
                                      p_b_grid_,
                                      p_ds_grid_,
@@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                      StrideB_,
                                      StrideDs_,
                                      StrideE_,
-                                     1,
+                                     KBatch_,
                                      a_element_op_,
                                      b_element_op_,
                                      c_element_op_},
@@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                 arg.Print();
             }
 
-            if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1)
+            if(!GridwiseGemm::CheckValidity(arg))
             {
                 throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
             }
 
             index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch);
+            std::tie(gdx, gdy, gdz) =
+                GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch * arg.KBatch);
 
             float ave_time = 0;
 
@@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         rotating_mem.Next();
                         // clear c mem
                         if(arg_.KBatch > 1)
-                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
-                                                             0,
-                                                             arg_.M * arg_.N * sizeof(CDataType),
-                                                             stream_config.stream_id_));
+                            hipGetErrorString(
+                                hipMemsetAsync(arg_.p_c_grid,
+                                               0,
+                                               arg.Batch * arg_.M * arg_.N * sizeof(CDataType),
+                                               stream_config.stream_id_));
                     };
 
                     ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
@@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                              index_t BatchStrideE,
                              AElementwiseOperation a_element_op,
                              BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             CElementwiseOperation c_element_op,
+                             index_t KBatch = 1)
     {
         return Argument{static_cast<const ADataType*>(p_a),
                         static_cast<const BDataType*>(p_b),
@@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         Batch,
                         a_element_op,
                         b_element_op,
-                        c_element_op};
+                        c_element_op,
+                        KBatch};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         index_t BatchStrideE,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
-                        CElementwiseOperation c_element_op) override
+                        CElementwiseOperation c_element_op,
+                        index_t KBatch = 1) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                           Batch,
                                           a_element_op,
                                           b_element_op,
-                                          c_element_op);
+                                          c_element_op,
+                                          KBatch);
     }
 
     // polymorphic
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
index c7038ed4f..e5a31f8d1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -41,7 +41,7 @@ __global__ void
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
     GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
         karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
@@ -76,7 +76,7 @@ __global__ void
     __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
     GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
         karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
@@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
     struct SplitKBatchOffset
     {
-        __device__ SplitKBatchOffset(Argument& karg)
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
         {
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead;
+                a_k_split_offset = k_id * karg.KRead;
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+                a_k_split_offset = k_id * karg.KRead * karg.StrideA;
             }
 
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
             {
-                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+                b_k_split_offset = k_id * karg.KRead * karg.StrideB;
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
-                b_k_split_offset = blockIdx.z * karg.KRead;
+                b_k_split_offset = k_id * karg.KRead;
             }
 
-            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            if(k_id < karg.KBatch - 1)
             {
                 karg.K = karg.KRead;
             }
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index 5db041de0..21cef335c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -52,6 +52,9 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   160,    64,   8,   8,  16,   16,    8,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   160,    64,   8,   8,  32,   32,    1,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,          S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   160,   128,    64,   8,   8,  32,   32,    5,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index 355dc3212..552ac3cd0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -42,6 +42,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std
         //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
 #ifdef __gfx94__
         // Compute friendly
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -72,6 +73,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std:
         //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
 #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
index 53f81162a..f4300af8d 100644
--- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
@@ -48,6 +48,7 @@ bool profile_gemm_universal_batched_impl(int do_verification,
                                          int StrideB,
                                          int StrideC,
                                          int BatchCount,
+                                         int KBatch,
                                          int n_warmup,
                                          int n_iter,
                                          uint64_t rotating = 0)
@@ -147,89 +148,100 @@ bool profile_gemm_universal_batched_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    float best_kbatch     = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
-        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
-        // false branch for multi d dl kernel
-
-        argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        {},
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        BatchCount,
-                                        StrideA,
-                                        StrideB,
-                                        {},
-                                        StrideC,
-                                        BatchStrideA,
-                                        BatchStrideB,
-                                        {},
-                                        BatchStrideC,
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{});
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            // re-init C to zero before profiling next kernel
-            c_device_buf.SetZero();
-
-            std::string op_name = op_ptr->GetTypeString();
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
 
-            float ave_time = invoker_ptr->Run(
-                argument_ptr.get(),
-                StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});
+        if(KBatch > 0)
+        {
+            kbatch_list = {KBatch};
+        }
 
-            std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
+        for(std::size_t i = 0; i < kbatch_list.size(); i++)
+        {
+            auto kbatch_curr = kbatch_list[i];
+
+            auto argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            kbatch_curr);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                std::string op_name = op_ptr->GetTypeString();
 
-            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                     sizeof(CDataType) * M * N) *
-                                    BatchCount;
+                float ave_time = invoker_ptr->Run(
+                    argument_ptr.get(),
+                    StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});
 
-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
 
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+                std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                         sizeof(CDataType) * M * N) *
+                                        BatchCount;
 
-            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s, " << op_name << std::endl;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
-            if(tflops > best_tflops)
-            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
 
-            if(do_verification)
-            {
-                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+                std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                          << " GB/s, " << op_name << ", KBatch " << kbatch_curr << std::endl;
 
-                pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+                if(tflops > best_tflops)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_kbatch     = kbatch_curr;
+                }
 
-                if(do_log)
+                if(do_verification)
                 {
-                    LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
-                        << std::endl;
+                    c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+
+                    pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
+                            << std::endl;
+                    }
                 }
             }
-        }
-        else
-        {
-            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
         }
     }
 
@@ -270,8 +282,8 @@ bool profile_gemm_universal_batched_impl(int do_verification,
 
     std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K
               << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC
-              << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
-              << " GB/s, " << best_op_name << std::endl;
+              << " KBatch = " << best_kbatch << ": " << best_ave_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
     return pass;
 }
diff --git a/profiler/src/profile_gemm_universal_batched.cpp b/profiler/src/profile_gemm_universal_batched.cpp
index 4afef8e55..d57511fbf 100644
--- a/profiler/src/profile_gemm_universal_batched.cpp
+++ b/profiler/src/profile_gemm_universal_batched.cpp
@@ -31,7 +31,7 @@ enum struct GemmDataType
 
 int profile_batched_gemm_universal(int argc, char* argv[])
 {
-    if(argc != 18 && argc != 21)
+    if(argc != 19 && argc != 22)
     {
         // clang-format off
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
@@ -44,11 +44,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg6: print tensor value (0: no; 1: yes)\n");
         printf("arg7: time kernel (0=n0, 1=yes)\n");
-        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        printf("arg8 to 18: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount, KBatch\n");
         printf("optional:\n");
-        printf("arg18: number of warm-up cycles (default 1)\n");
-        printf("arg19: number of iterations (default 10)\n");
-        printf("arg20: memory for rotating buffer (default 0, size in MB)\n");
+        printf("arg19: number of warm-up cycles (default 1)\n");
+        printf("arg20: number of iterations (default 10)\n");
+        printf("arg21: memory for rotating buffer (default 0, size in MB)\n");
         // clang-format on
         exit(1);
     }
@@ -56,11 +56,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
     int n_warmup      = 1;
     int n_iter        = 10;
     uint64_t rotating = 0;
-    if(argc == 21)
+    if(argc == 22)
     {
-        n_warmup = std::stoi(argv[18]);
-        n_iter   = std::stoi(argv[19]);
-        rotating = std::stoull(argv[20]) * 1024 * 1024;
+        n_warmup = std::stoi(argv[19]);
+        n_iter   = std::stoi(argv[20]);
+        rotating = std::stoull(argv[21]) * 1024 * 1024;
     }
 
     const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
@@ -83,6 +83,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
     const int BatchStrideC = std::stoi(argv[16]);
 
     const int BatchCount = std::stoi(argv[17]);
+    const int KBatch     = std::stoi(argv[18]);
 
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
@@ -159,6 +160,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
                                                                                     StrideB_,
                                                                                     StrideC_,
                                                                                     BatchCount,
+                                                                                    KBatch,
                                                                                     n_warmup,
                                                                                     n_iter,
                                                                                     rotating);
-- 
GitLab


From 41ebf117a5927654a504803c19d18749babdeddd Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:30:22 -0800
Subject: [PATCH 118/153] Add zstd lib for building hipTensor. (#1745)

* add zstd library to CI docker

* fix the libzstd name
---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 8ce158a20..4329c54c1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -64,6 +64,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     nano \
     zlib1g-dev \
     zip \
+    libzstd-dev \
     openssh-server \
     clang-format-12 \
     kmod && \
-- 
GitLab


From d68974a5c68bd25bb8433302886213d7f5ff0d88 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:30:39 -0800
Subject: [PATCH 119/153] upgrade pandas package (#1746)

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 4329c54c1..83edbfb8e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -94,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \
     dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
     pip3 install --upgrade pip && \
-    pip3 install sqlalchemy==1.4.46 pymysql pandas==2.0.3 setuptools-rust sshtunnel==0.4.0 && \
+    pip3 install sqlalchemy==1.4.46 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
 # Add render group
     groupadd -f render && \
 # Install the new rocm-cmake version
-- 
GitLab


From f57d720c67123b43cb6f18f4b8b5aa0c7c9f51ba Mon Sep 17 00:00:00 2001
From: "Xu, Shengnan" <117875955+shengnxu@users.noreply.github.com>
Date: Sun, 15 Dec 2024 20:13:10 +0800
Subject: [PATCH 120/153] added moe interleaving pipeline (#1712)

* added moe interleaving pipeline

* remove redundant code

* formater

---------

Co-authored-by: root <root@hjbog-srdc-14.amd.com>
---
 include/ck_tile/ops/flatmm.hpp                |   1 +
 ...latmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp | 510 +++++++++++++
 ..._uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc | 708 ++++++++++++++++++
 .../fused_moegemm_pipeline_flatmm_policy.hpp  |  29 +-
 .../pipeline/fused_moegemm_traits.hpp         |   4 +-
 5 files changed, 1249 insertions(+), 3 deletions(-)
 create mode 100644 include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
 create mode 100644 include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc

diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp
index eee80cda4..ba76e3070 100644
--- a/include/ck_tile/ops/flatmm.hpp
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -5,6 +5,7 @@
 
 #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
new file mode 100644
index 000000000..681a69603
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
+
+namespace ck_tile {
+
+// "S"tream update output along "N"
+// A in smem, B load from global
+// require 4 wave, occupancy=1c
+
+struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
+{
+    using BDataType = bf16_t;
+    using ODataType = bf16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
+    template <typename BRes,
+              typename BCoords,
+              typename ORes,
+              typename OCoords,
+              typename OFlags,
+              typename ScaleTensor>
+    CK_TILE_DEVICE auto
+    operator()(const BRes& res_b,
+               const BCoords& cached_coords_b,
+               const ORes& res_o,
+               const OCoords& cached_coords_o,
+               const OFlags& o_flags, // this should be in sgpr
+               CK_TILE_LDS_ADDR void* smem,
+               index_t n, // loop along n dim
+               const ScaleTensor& scale_,
+               index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
+               index_t tile_offset_o)
+    {
+        static_assert(BCoords::size() == 8); // 8
+        static_assert(OCoords::size() == 8);
+
+        const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
+        const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
+
+        static_assert(ScaleTensor::size() == 2);
+        float s0 = scale_[number<0>{}];
+        float s1 = scale_[number<1>{}];
+
+        // index_t loop_cnt = n / Block_N;
+
+        register float v_c0 asm("v64");
+        register float v_c1 asm("v65");
+        register float v_c2 asm("v66");
+        register float v_c3 asm("v67");
+        register float v_c4 asm("v68");
+        register float v_c5 asm("v69");
+        register float v_c6 asm("v70");
+        register float v_c7 asm("v71");
+        register float v_c8 asm("v72");
+        register float v_c9 asm("v73");
+        register float v_c10 asm("v74");
+        register float v_c11 asm("v75");
+        register float v_c12 asm("v76");
+        register float v_c13 asm("v77");
+        register float v_c14 asm("v78");
+        register float v_c15 asm("v79");
+        register float v_c16 asm("v80");
+        register float v_c17 asm("v81");
+        register float v_c18 asm("v82");
+        register float v_c19 asm("v83");
+        register float v_c20 asm("v84");
+        register float v_c21 asm("v85");
+        register float v_c22 asm("v86");
+        register float v_c23 asm("v87");
+        register float v_c24 asm("v88");
+        register float v_c25 asm("v89");
+        register float v_c26 asm("v90");
+        register float v_c27 asm("v91");
+        register float v_c28 asm("v92");
+        register float v_c29 asm("v93");
+        register float v_c30 asm("v94");
+        register float v_c31 asm("v95");
+        int32_t nan_hi = 0x7fff0000;
+        int32_t nan_lo = 0x00007fff;
+
+        // in smem, the layout is  M0(2)*K0(128)*M1(16)*K1(4)
+        // every threads need 8xK in contiguous register
+        // ... and every wave need the same data
+        int lane_id  = threadIdx.x % 64;
+        int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
+        sld_y_os *= 2;
+
+        //                    y     y     p     p      p      y
+        // reg before shfl  M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
+        // but order is N0*M0*Nv
+        // in LDS we need store as
+        //          M0(2)* N0(2) *  Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
+        //             y    y       wave-id  lid/16  lid%16   v
+        // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
+        int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
+        sfl_sst *= 2;
+
+        // from LDS we need load as
+        //          M0(2)*    N0(2) *  Nl(4) * Nw(4) * (Mw(16)         *  Nv(4) + 4)
+        //        ( 2 issue)    (rem 32-lane)        (4 wave*4issue)   2lane*1ussue(pk2)
+        // sld(v4) = v0/2 *34*4  + v0 % 2 *4 + wid*2 *4
+        int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
+        sfl_sld *= 2;
+
+        // B nr->kr
+        // clang-format off
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :[smem_]"+r"(smem),
+            // [s_loop_cnt]"+s"(loop_cnt),
+            [s_loop_cnt]"+s"(n),
+                [c0]"+v" (v_c0),
+                [c1]"+v" (v_c1),
+                [c2]"+v" (v_c2),
+                [c3]"+v" (v_c3),
+                [c4]"+v" (v_c4),
+                [c5]"+v" (v_c5),
+                [c6]"+v" (v_c6),
+                [c7]"+v" (v_c7),
+                [c8]"+v" (v_c8),
+                [c9]"+v" (v_c9),
+                [c10]"+v"(v_c10),
+                [c11]"+v"(v_c11),
+                [c12]"+v"(v_c12),
+                [c13]"+v"(v_c13),
+                [c14]"+v"(v_c14),
+                [c15]"+v"(v_c15),
+                [c16]"+v"(v_c16),
+                [c17]"+v"(v_c17),
+                [c18]"+v"(v_c18),
+                [c19]"+v"(v_c19),
+                [c20]"+v"(v_c20),
+                [c21]"+v"(v_c21),
+                [c22]"+v"(v_c22),
+                [c23]"+v"(v_c23),
+                [c24]"+v"(v_c24),
+                [c25]"+v"(v_c25),
+                [c26]"+v"(v_c26),
+                [c27]"+v"(v_c27),
+                [c28]"+v"(v_c28),
+                [c29]"+v"(v_c29),
+                [c30]"+v"(v_c30),
+                [c31]"+v"(v_c31)
+            :
+            [sld_a_base]"n"(0),
+            [shfl_base]"n"(0),
+            [v_sld_y_os]"v"(sld_y_os),
+            [v_sfl_sld]"v"(sfl_sld),
+            [v_sfl_sst]"v"(sfl_sst),
+            [s_res_o0]"s"(res_o[0]),
+                [s_res_o1]"s"(res_o[1]),
+                //[s_res_o2]"s"(res_o[2]),
+                //[s_res_o3]"s"(res_o[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
+                [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
+                [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
+                [v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
+                [v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
+                [v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
+                [v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
+                [v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [s_tile_os_o]"s"(tile_stride_o_bytes),
+                [s_tile_os_b]"s"(tile_stride_b_bytes),
+                [scale_0]"v"(s0),
+                [scale_1]"v"(s1),
+                [v_nan_lo]"v"(nan_lo),
+                [v_nan_hi]"v"(nan_hi),
+                [s_execflag_0]"s"(o_flags[number<0>{}]),
+                [s_execflag_1]"s"(o_flags[number<1>{}]),
+                [s_execflag_2]"s"(o_flags[number<2>{}]),
+                [s_execflag_3]"s"(o_flags[number<3>{}]),
+                [s_execflag_4]"s"(o_flags[number<4>{}]),
+                [s_execflag_5]"s"(o_flags[number<5>{}]),
+                [s_execflag_6]"s"(o_flags[number<6>{}]),
+                [s_execflag_7]"s"(o_flags[number<7>{}])
+            :
+          "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
+          "s36", "s37","s59","s80",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+          "v50", "v54", "v55",
+          "v64","v65","v66","v67","v68","v69","v70","v71",
+          "v72","v73","v74","v75","v76","v77","v78","v79",
+          "v80","v81","v82","v83","v84","v85","v86","v87",
+          "v88","v89","v90","v91","v92","v93","v94","v95",
+          "v128", "v129", "v130", "v131",
+          "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
+          "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
+          "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
+          "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
+          "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
+          "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+          "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
+          "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
+          "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
+          "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
+          "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
+          "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
+          "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
+          "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
+          "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+          "v252", "v253", "v254", "v255"
+        );
+#pragma clang diagnostic pop
+        // clang-format on
+    }
+};
+
+struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
+{
+    using BDataType = bf16_t;
+    using ODataType = bf16_t;
+
+    // TODO: need paired with tile_window_linear!
+    // TODO: need call init_raw() before call this function!
+    // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
+    template <typename BRes,
+              typename BCoords,
+              typename ORes,
+              typename OCoords,
+              typename OFlags,
+              typename ScaleTensor>
+    CK_TILE_DEVICE auto
+    operator()(const BRes& res_b,
+               const BCoords& cached_coords_b,
+               const ORes& res_o,
+               const OCoords& cached_coords_o,
+               const OFlags& o_flags, // this should be in sgpr
+               CK_TILE_LDS_ADDR void* smem,
+               index_t n, // loop along n dim
+               const ScaleTensor& scale_,
+               index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
+               index_t tile_offset_o)
+    {
+        static_assert(BCoords::size() == 8); // 8
+        static_assert(OCoords::size() == 8);
+
+        const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
+        const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
+
+        static_assert(ScaleTensor::size() == 2);
+        float s0 = scale_[number<0>{}];
+        float s1 = scale_[number<1>{}];
+
+        // index_t loop_cnt = n / Block_N;
+
+        register float v_c0 asm("v64");
+        register float v_c1 asm("v65");
+        register float v_c2 asm("v66");
+        register float v_c3 asm("v67");
+        register float v_c4 asm("v68");
+        register float v_c5 asm("v69");
+        register float v_c6 asm("v70");
+        register float v_c7 asm("v71");
+        register float v_c8 asm("v72");
+        register float v_c9 asm("v73");
+        register float v_c10 asm("v74");
+        register float v_c11 asm("v75");
+        register float v_c12 asm("v76");
+        register float v_c13 asm("v77");
+        register float v_c14 asm("v78");
+        register float v_c15 asm("v79");
+        register float v_c16 asm("v80");
+        register float v_c17 asm("v81");
+        register float v_c18 asm("v82");
+        register float v_c19 asm("v83");
+        register float v_c20 asm("v84");
+        register float v_c21 asm("v85");
+        register float v_c22 asm("v86");
+        register float v_c23 asm("v87");
+        register float v_c24 asm("v88");
+        register float v_c25 asm("v89");
+        register float v_c26 asm("v90");
+        register float v_c27 asm("v91");
+        register float v_c28 asm("v92");
+        register float v_c29 asm("v93");
+        register float v_c30 asm("v94");
+        register float v_c31 asm("v95");
+        int32_t nan_hi = 0x7fff0000;
+        int32_t nan_lo = 0x00007fff;
+
+        // in smem, the layout is  M0(2)*K0(128)*M1(16)*K1(4)
+        // every threads need 8xK in contiguous register
+        // ... and every wave need the same data
+        int lane_id  = threadIdx.x % 64;
+        int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
+        sld_y_os *= 2;
+
+        //                    y     y     p     p      p      y
+        // reg before shfl  M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
+        // but order is N0*M0*Nv
+        // in LDS we need store as
+        //          M0(2)* N0(2) *  Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
+        //             y    y       wave-id  lid/16  lid%16   v
+        // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
+        int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
+        sfl_sst *= 2;
+
+        // from LDS we need load as
+        //          M0(2)*    N0(2) *  Nl(4) * Nw(4) * (Mw(16)         *  Nv(4) + 4)
+        //        ( 2 issue)    (rem 32-lane)        (4 wave*4issue)   2lane*1ussue(pk2)
+        // sld(v4) = v0/2 *34*4  + v0 % 2 *4 + wid*2 *4
+        int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
+        sfl_sld *= 2;
+
+        // B nr->kr
+        // clang-format off
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winline-asm"
+        asm volatile(
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
+#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
+#undef CK_TILE_FLATMM_UK_MFMA
+            :[smem_]"+r"(smem),
+            [s_loop_cnt]"+s"(n),
+                [c0]"+v" (v_c0),
+                [c1]"+v" (v_c1),
+                [c2]"+v" (v_c2),
+                [c3]"+v" (v_c3),
+                [c4]"+v" (v_c4),
+                [c5]"+v" (v_c5),
+                [c6]"+v" (v_c6),
+                [c7]"+v" (v_c7),
+                [c8]"+v" (v_c8),
+                [c9]"+v" (v_c9),
+                [c10]"+v"(v_c10),
+                [c11]"+v"(v_c11),
+                [c12]"+v"(v_c12),
+                [c13]"+v"(v_c13),
+                [c14]"+v"(v_c14),
+                [c15]"+v"(v_c15),
+                [c16]"+v"(v_c16),
+                [c17]"+v"(v_c17),
+                [c18]"+v"(v_c18),
+                [c19]"+v"(v_c19),
+                [c20]"+v"(v_c20),
+                [c21]"+v"(v_c21),
+                [c22]"+v"(v_c22),
+                [c23]"+v"(v_c23),
+                [c24]"+v"(v_c24),
+                [c25]"+v"(v_c25),
+                [c26]"+v"(v_c26),
+                [c27]"+v"(v_c27),
+                [c28]"+v"(v_c28),
+                [c29]"+v"(v_c29),
+                [c30]"+v"(v_c30),
+                [c31]"+v"(v_c31)
+            :
+            [sld_a_base]"n"(0),
+            [shfl_base]"n"(0),
+            [v_sld_y_os]"v"(sld_y_os),
+            [v_sfl_sld]"v"(sfl_sld),
+            [v_sfl_sst]"v"(sfl_sst),
+            [s_res_o0]"s"(res_o[0]),
+                [s_res_o1]"s"(res_o[1]),
+                //[s_res_o2]"s"(res_o[2]),
+                //[s_res_o3]"s"(res_o[3]),
+                [s_res_b0]"s"(res_b[0]),
+                [s_res_b1]"s"(res_b[1]),
+                [s_res_b2]"s"(res_b[2]),
+                [s_res_b3]"s"(res_b[3]),
+                [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
+                [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
+                [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
+                [v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
+                [v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
+                [v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
+                [v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
+                [v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
+                [v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
+                [v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
+                [v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
+                [v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
+                [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
+                [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
+                [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
+                [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
+
+                [s_tile_os_o]"s"(tile_stride_o_bytes),
+                [s_tile_os_b]"s"(tile_stride_b_bytes),
+                [scale_0]"v"(s0),
+                [scale_1]"v"(s1),
+                [v_nan_lo]"v"(nan_lo),
+                [v_nan_hi]"v"(nan_hi),
+                [s_execflag_0]"s"(o_flags[number<0>{}]),
+                [s_execflag_1]"s"(o_flags[number<1>{}]),
+                [s_execflag_2]"s"(o_flags[number<2>{}]),
+                [s_execflag_3]"s"(o_flags[number<3>{}]),
+                [s_execflag_4]"s"(o_flags[number<4>{}]),
+                [s_execflag_5]"s"(o_flags[number<5>{}]),
+                [s_execflag_6]"s"(o_flags[number<6>{}]),
+                [s_execflag_7]"s"(o_flags[number<7>{}])
+            :
+          "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+          "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
+          "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
+          "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
+          "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
+          "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
+          "a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
+          "a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
+          "a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
+          "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
+          "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
+          "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
+          "a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
+          "a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
+          "a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
+          "a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
+          "a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
+          "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
+          "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
+          "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
+          "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
+          "a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
+          "a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
+          "a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
+          "a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
+          "a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
+          "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
+          "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
+          "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
+          "a252", "a253", "a254", "a255", 
+          "s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
+          "s36", "s37","s59","s80",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+          "v50", "v54", "v55",
+          "v64","v65","v66","v67","v68","v69","v70","v71",
+          "v72","v73","v74","v75","v76","v77","v78","v79",
+          "v80","v81","v82","v83","v84","v85","v86","v87",
+          "v88","v89","v90","v91","v92","v93","v94","v95",
+          "v128", "v129", "v130", "v131",
+          "v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
+          "v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
+          "v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
+          "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
+          "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
+          "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+          "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
+          "v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
+          "v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
+          "v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
+          "v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
+          "v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
+          "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
+          "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
+          "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+          "v252", "v253", "v254", "v255"
+        );
+#pragma clang diagnostic pop
+        // clang-format on
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
new file mode 100644
index 000000000..b8c6d2002
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
@@ -0,0 +1,708 @@
+#ifndef CK_TILE_FLATMM_UK_MFMA
+#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
+#endif
+
+#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
+# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
+
+# define _UK_PK_CVT_(x0_, x1_, y_) \
+ " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
+ " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
+ " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
+ " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
+ " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
+ " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
+ " v_perm_b32 " y_ ", v55, v54, s52 \n"
+
+# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
+
+#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
+
+# define _UK_PK_CVT_(x0_, x1_, y_) \
+ " v_cvt_f16_f32 v54, " x0_ " \n" \
+ " v_cvt_f16_f32 v55, " x1_ " \n" \
+ " v_pack_b32_f16 " y_ ", v54, v55 \n"
+
+# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
+
+#endif
+
+
+";-------------------------------------------------------------\n"
+" s_mov_b32 s52, 0x07060302 ; v_perm\n"
+" s_mov_b64 s[38:39], exec ; save current exec\n"
+" s_mov_b32 s8, %[s_res_o0] \n"
+" s_mov_b32 s9, %[s_res_o1] \n"
+" s_mov_b32 s12, %[s_res_b0] \n"
+" s_mov_b32 s13, %[s_res_b1] \n"
+" s_mov_b32 s14, %[s_res_b2] \n"
+" s_mov_b32 s15, %[s_res_b3] \n" 
+" s_mov_b32 s59, 0 \n"
+" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
+" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
+" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
+" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
+" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
+" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
+" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
+" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
+" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
+" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
+" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
+" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
+" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
+" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
+" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
+" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
+" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
+" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
+" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
+" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
+" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
+" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
+" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
+" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
+" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
+" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
+" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
+" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
+" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
+" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
+" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
+" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
+" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
+" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
+" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
+" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
+" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
+" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
+" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
+" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
+" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
+" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
+" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
+" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
+" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
+" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
+" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
+" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
+" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
+" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
+" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
+" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
+" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
+" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
+" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
+" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
+" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
+" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
+" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
+" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
+" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
+" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
+" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
+" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n" 
+" s_waitcnt 0 \n"
+" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+" s_add_u32 s12, %[s_tile_os_b], s12 \n" 
+" s_addc_u32 s13, 0, s13 \n" 
+" v_mov_b32 v64, 0 \n"
+" v_mov_b32 v80, 0 \n"
+" v_mov_b32 v65, 0 \n"
+" v_mov_b32 v81, 0 \n"
+" v_mov_b32 v66, 0 \n"
+" v_mov_b32 v82, 0 \n"
+" v_mov_b32 v67, 0 \n"
+" v_mov_b32 v83, 0 \n"
+" v_mov_b32 v68, 0 \n"
+" v_mov_b32 v84, 0 \n"
+" v_mov_b32 v69, 0 \n"
+" v_mov_b32 v85, 0 \n"
+" v_mov_b32 v70, 0 \n"
+" v_mov_b32 v86, 0 \n"
+" v_mov_b32 v71, 0 \n"
+" v_mov_b32 v87, 0 \n"
+" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168  \n"
+" s_mov_b32 s80, 0 \n" 
+" s_waitcnt vmcnt(24) \n" 
+"label_0AA6: \n"
+" s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
+" s_barrier \n" 
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0  \n"
+" ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
+" ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[128:131],  %[v_os_b0], s[12:15], 0 offen  \n" 
+" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]  \n"
+" ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
+" ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]  \n"
+" ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
+" ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[132:135],  %[v_os_b0], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]  \n"
+" ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
+" ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[136:139],  %[v_os_b0], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[140:143],  %[v_os_b0], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]  \n"
+ " s_waitcnt lgkmcnt(0) \n"
+ " s_mov_b64 exec, %[s_execflag_0] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[144:147],  %[v_os_b1], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[148:151],  %[v_os_b1], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[152:155],  %[v_os_b1], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[156:159],  %[v_os_b1], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]  \n"
+ " s_mov_b64 exec, %[s_execflag_1] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]  \n"
+" s_waitcnt vmcnt(30)   \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[160:163],  %[v_os_b2], s[12:15], 0 offen  \n" 
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[164:167],  %[v_os_b2], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[168:171],  %[v_os_b2], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[172:175],  %[v_os_b2], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]  \n"
+ " s_mov_b64 exec, %[s_execflag_2] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[176:179],  %[v_os_b3], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[180:183],  %[v_os_b3], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[184:187],  %[v_os_b3], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[188:191],  %[v_os_b3], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]  \n"
+ " s_mov_b64 exec, %[s_execflag_3] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]    \n"
+" s_waitcnt vmcnt(30)   \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[192:195],  %[v_os_b4], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[196:199],  %[v_os_b4], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[200:203],  %[v_os_b4], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[204:207],  %[v_os_b4], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]  \n"
+ " s_mov_b64 exec, %[s_execflag_4] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[208:211],  %[v_os_b5], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[212:215],  %[v_os_b5], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[216:219],  %[v_os_b5], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[220:223],  %[v_os_b5], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]  \n"
+ " s_mov_b64 exec, %[s_execflag_5] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_waitcnt vmcnt(30)   \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[224:227],  %[v_os_b6], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]  \n"
+ " buffer_load_dwordx4 acc[228:231],  %[v_os_b6], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]  \n"
+ _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[232:235],  %[v_os_b6], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]  \n"
+ " buffer_load_dwordx4 acc[236:239],  %[v_os_b6], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]  \n"
+ _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]  \n"
+ " s_mov_b64 exec, %[s_execflag_6] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[240:243],  %[v_os_b7], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]  \n"
+ " buffer_load_dwordx4 acc[244:247],  %[v_os_b7], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]  \n"
+ _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[248:251],  %[v_os_b7], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]  \n"
+ " buffer_load_dwordx4 acc[252:255],  %[v_os_b7], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]  \n"
+ _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]  \n"
+ " s_mov_b64 exec, %[s_execflag_7] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_add_u32 s60, 0x00000100, s80  \n"
+" s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
+" s_cselect_b32 s56, %[s_tile_os_b], 0  \n"
+" s_add_u32 s12, s56, s12  \n"
+" s_addc_u32 s13, 0, s13  \n"
+" s_cmp_ge_u32 s80, 0x00000100  \n"
+" s_cselect_b32 s59, %[s_tile_os_o], s59  \n"
+" s_add_u32 s8, s59, s8  \n"
+" s_addc_u32 s9, 0, s9  \n"
+" v_mul_f32 %[c0], %[scale_0], %[c0] \n"
+" v_mul_f32 %[c1], %[scale_0], %[c1] \n"
+" v_mul_f32 %[c2], %[scale_0], %[c2] \n"
+" v_mul_f32 %[c3], %[scale_0], %[c3] \n"
+" v_mul_f32 %[c4], %[scale_1], %[c4] \n"
+" v_mul_f32 %[c5], %[scale_1], %[c5] \n"
+" v_mul_f32 %[c6], %[scale_1], %[c6] \n"
+" v_mul_f32 %[c7], %[scale_1], %[c7] \n"
+" v_mul_f32 %[c8], %[scale_0], %[c8] \n"
+" v_mul_f32 %[c9], %[scale_0], %[c9] \n"
+" v_mul_f32 %[c10], %[scale_0], %[c10] \n"
+" v_mul_f32 %[c11], %[scale_0], %[c11] \n"
+" v_mul_f32 %[c12], %[scale_1], %[c12] \n"
+" v_mul_f32 %[c13], %[scale_1], %[c13] \n"
+" v_mul_f32 %[c14], %[scale_1], %[c14] \n"
+" v_mul_f32 %[c15], %[scale_1], %[c15] \n"
+_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]")
+_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
+_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]")
+_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]")
+_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]")
+_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
+_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
+_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
+" s_addk_i32 s80, 0x0080  \n"
+" s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
+" s_cbranch_scc0 label_0EC1  \n"
+" s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
+" s_barrier  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0  \n"
+" ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
+" ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
+" ds_write_b64 v3, v[64:65] offset:16640  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[0:3],  %[v_os_b0], s[12:15], 0 offen  \n"
+" ds_write_b64 v3, v[66:67] offset:20992  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]  \n"
+" ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
+" ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
+" ds_write_b64 v3, v[68:69] offset:18816  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]  \n"
+" ds_write_b64 v3, v[70:71] offset:23168  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]  \n"
+" ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
+" ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[4:7],  %[v_os_b0], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]  \n"
+" ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
+" ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[8:11],  %[v_os_b0], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[12:15],  %[v_os_b0], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]  \n"
+ " s_waitcnt lgkmcnt(0) \n" 
+ " s_mov_b64 exec, %[s_execflag_0] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[16:19],  %[v_os_b1], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[20:23],  %[v_os_b1], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[24:27],  %[v_os_b1], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[28:31],  %[v_os_b1], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]  \n"
+ " s_mov_b64 exec, %[s_execflag_1] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_waitcnt vmcnt(30) \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[32:35],  %[v_os_b2], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[36:39],  %[v_os_b2], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[40:43],  %[v_os_b2], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[44:47],  %[v_os_b2], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]  \n"
+ " s_mov_b64 exec, %[s_execflag_2] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[48:51],  %[v_os_b3], s[12:15], 0 offen  \n" 
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[52:55],  %[v_os_b3], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[56:59],  %[v_os_b3], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[60:63],  %[v_os_b3], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]  \n"
+ " s_mov_b64 exec, %[s_execflag_3] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_waitcnt vmcnt(30) \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[64:67],  %[v_os_b4], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[68:71],  %[v_os_b4], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[72:75],  %[v_os_b4], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[76:79],  %[v_os_b4], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]  \n"
+ " s_mov_b64 exec, %[s_execflag_4] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[80:83],  %[v_os_b5], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[84:87],  %[v_os_b5], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[88:91],  %[v_os_b5], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[92:95],  %[v_os_b5], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]  \n"
+ " s_mov_b64 exec, %[s_execflag_5] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_waitcnt vmcnt(30)  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[96:99],  %[v_os_b6], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]  \n"
+ " buffer_load_dwordx4 acc[100:103],  %[v_os_b6], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83]  \n"
+ _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[104:107],  %[v_os_b6], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]  \n"
+ " buffer_load_dwordx4 acc[108:111],  %[v_os_b6], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87]  \n"
+ _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]  \n"
+ " s_mov_b64 exec, %[s_execflag_6] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[112:115],  %[v_os_b7], s[12:15], 0 offen  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]  \n"
+ " buffer_load_dwordx4 acc[116:119],  %[v_os_b7], s[12:15], 0 offen offset:1024  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91]  \n"
+ _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[120:123],  %[v_os_b7], s[12:15], 0 offen offset:2048  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]  \n"
+ " buffer_load_dwordx4 acc[124:127],  %[v_os_b7], s[12:15], 0 offen offset:3072  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95]  \n"
+ _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]  \n"
+ " s_mov_b64 exec, %[s_execflag_7] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_add_u32 s60, 0x00000100, s80  \n"
+" s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
+" s_cselect_b32 s56, s56, 0  \n"
+" s_add_u32 s12, s56, s12  \n"
+" s_addc_u32 s13, 0, s13  \n"
+" s_cmp_ge_u32 s80, 0x00000100  \n"
+" s_cselect_b32 s59, 0x00000100, s59  \n"
+" s_add_u32 s8, s59, s8  \n"
+" s_addc_u32 s9, 0, s9  \n"
+" v_mul_f32 %[c16], %[scale_0], %[c16] \n"
+" v_mul_f32 %[c17], %[scale_0], %[c17] \n"
+" v_mul_f32 %[c18], %[scale_0], %[c18] \n"
+" v_mul_f32 %[c19], %[scale_0], %[c19] \n"
+" v_mul_f32 %[c20], %[scale_1], %[c20] \n"
+" v_mul_f32 %[c21], %[scale_1], %[c21] \n"
+" v_mul_f32 %[c22], %[scale_1], %[c22] \n"
+" v_mul_f32 %[c23], %[scale_1], %[c23] \n"
+" v_mul_f32 %[c24], %[scale_0], %[c24] \n"
+" v_mul_f32 %[c25], %[scale_0], %[c25] \n"
+" v_mul_f32 %[c26], %[scale_0], %[c26] \n"
+" v_mul_f32 %[c27], %[scale_0], %[c27] \n"
+" v_mul_f32 %[c28], %[scale_1], %[c28] \n"
+" v_mul_f32 %[c29], %[scale_1], %[c29] \n"
+" v_mul_f32 %[c30], %[scale_1], %[c30] \n"
+" v_mul_f32 %[c31], %[scale_1], %[c31] \n"
+_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]")
+_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
+_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]")
+_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
+_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
+_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
+_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
+_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]")
+" s_addk_i32 s80, 0x0080  \n"
+" s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
+" s_cbranch_scc0 label_0EC1  \n"
+" s_branch label_0AA6  \n"
+" label_0EC1: \n"
+" s_waitcnt lgkmcnt(0)  \n"
+" s_barrier  \n"
+" ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
+" ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
+" ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
+" ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
+" ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
+" ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n"
+" ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
+" ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n"
+" s_waitcnt lgkmcnt(0)  \n"
+ " s_mov_b64 exec, %[s_execflag_0] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_1] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_2] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_3] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_4] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_5] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_6] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_7] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]                           \n"
+" s_add_u32 s8, s59, s8  \n"
+" s_addc_u32 s9, 0, s9  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520  \n"
+" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872  \n"
+" s_waitcnt lgkmcnt(0)  \n"
+" s_barrier  \n"
+" ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
+" ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
+" ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
+" ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
+" ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
+" ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n"
+" ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
+" ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n"
+" s_waitcnt lgkmcnt(0)  \n"
+" s_mov_b64 exec, %[s_execflag_0] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_1] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_2] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_3] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_4] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_5] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_6] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
+ " s_mov_b64 exec, %[s_execflag_7] \n"
+_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
+"  s_mov_b64     exec, s[38:39]  \n"
+
+#undef _UK_MFMA_ 
+#undef _UK_PK_CVT_
+#undef _UK_ATOMIC_ADD_
+
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
index fea30f029..629f0ee8f 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy
     CK_TILE_HOST_DEVICE static constexpr auto GetUK_1()
     {
         using S_ = typename Problem::BlockShape;
+        using T_ = typename Problem::Traits;
         if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
                      std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
                      std::is_same_v<typename Problem::TopkWeightDataType, float> &&
                      S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
-                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                     T_::PipeInterleave == false)
         {
             return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{};
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
         }
         else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
                           std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
                           std::is_same_v<typename Problem::TopkWeightDataType, float> &&
                           S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
-                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == false)
         {
             return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == true)
+        {
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == true)
+        {
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
         }
     }
 };
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
index d7127b098..3fb82bc09 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
@@ -22,7 +22,8 @@ template <bool IsGateOnly_,
           FusedMoeGemmWeightPermuteEnum PermuteEnum_ =
               FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten,
           bool PadHiddenSize_       = false,
-          bool PadIntermediateSize_ = false>
+          bool PadIntermediateSize_ = false,
+          bool PipeInterleave_      = true>
 struct FusedMoeGemmTraits
 {
     // Gate+Up or Gate only
@@ -32,6 +33,7 @@ struct FusedMoeGemmTraits
     static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_;
     static constexpr bool PadHiddenSize                        = PadHiddenSize_;
     static constexpr bool PadIntermediateSize                  = PadIntermediateSize_;
+    static constexpr bool PipeInterleave                       = PipeInterleave_;
 };
 
 // Note: this need to be a bit mask
-- 
GitLab


From fdfe2102304f62ec62194706a5f67766ae824dc6 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Sun, 15 Dec 2024 16:25:21 -0800
Subject: [PATCH 121/153] upgrade sqlalchemy version (#1748)

* upgrade sqlalchemy version

* replace the connection with engine in to_sql call

* change the hipTes=nsor ctest syntax
---
 Dockerfile                  | 2 +-
 Jenkinsfile                 | 4 +---
 script/process_perf_data.py | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 83edbfb8e..a3bf3866b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -94,7 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \
     dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
     pip3 install --upgrade pip && \
-    pip3 install sqlalchemy==1.4.46 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
+    pip3 install sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust sshtunnel==0.4.0 && \
 # Add render group
     groupadd -f render && \
 # Install the new rocm-cmake version
diff --git a/Jenkinsfile b/Jenkinsfile
index f82c34afa..87c9457fc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -566,11 +566,9 @@ def Build_CK(Map conf=[:]){
                                 ls -ltr
                                 CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install"
                                 cmake --build build -- -j
+                                ctest --test-dir build
                             """
                         }
-                        dir("hipTensor-${params.hipTensor_branch}/build"){
-                            sh 'ctest'
-                        }
                     }
                 }
             }
diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index fbfec94ee..32e2e15d7 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -332,7 +332,7 @@ def main():
             table_name="ck_fmha_bwd_tflops"
 
         tflops_base = get_baseline(table_name,conn)
-        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
+        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
         conn.close()
 
     #compare the results to the baseline if baseline exists
-- 
GitLab


From a8ad7fcce912c8e462ca69d5ca680d99b2ef56dd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Dec 2024 18:14:52 +0000
Subject: [PATCH 122/153] add template placeholders

---
 .github/CONTRIBUTING.md          |  0
 .github/ISSUE_TEMPLATE.md        | 14 ++++++++++++++
 .github/PULL_REQUEST_TEMPLATE.md |  0
 3 files changed, 14 insertions(+)
 create mode 100644 .github/CONTRIBUTING.md
 create mode 100644 .github/ISSUE_TEMPLATE.md
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 000000000..263cc3480
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,14 @@
+When creating an issue, please check if a similar issue already exists.
+
+### When reporting a bug, please include:
+- [ ] A descriptive title
+- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro)
+- [ ] ROCm version, clang version, Composable Kernel commit pin
+- [ ] Environment variables
+- [ ] The behavior you expect to see, and the behavior you actually see
+
+### When requesting a feature, please include:
+- [ ] A descriptive title
+- [ ] A detailed description of the problem you are trying to solve
+- [ ] An overview of the suggested solution
+- [ ] Explanation why the solution is an improvement
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 000000000..e69de29bb
-- 
GitLab


From 30a37cac0e76298ef184597b1f7d3ef0d3f4bb60 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Dec 2024 18:50:27 +0000
Subject: [PATCH 123/153] add pull request template placeholder

---
 .github/PULL_REQUEST_TEMPLATE.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index e69de29bb..c5161f7f8 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,19 @@
+## Proposed changes
+
+Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request.
+
+## Checklist
+
+Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.
+
+- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
+- [ ] I have added inline documentation which enables the maintainers with understanding the motivation
+- [ ] I have removed the stale documentation which is no longer relevant after this pull request
+- [ ] I have added release notes which provide the end users with a brief summary of the improvement from this pull request
+- [ ] I have run `clang-format` on all changed files
+- [ ] Any dependent changes have been merged
+
+## Discussion
+
+If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered
+
-- 
GitLab


From 1b75c77da41afdfa8cff30a40bbe0fc4bd1d643f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Dec 2024 19:14:37 +0000
Subject: [PATCH 124/153] add contributing placeholder

---
 .github/CONTRIBUTING.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index e69de29bb..56f2acee7 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,10 @@
+We'd love for you to contribute to our source code!
+
+Some helpful links:
+
+- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt)
+- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md)
+- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md)
+- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md)
+- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md)
+- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html)
\ No newline at end of file
-- 
GitLab


From 0fd6978d2a3c5973d9c0486616b2a71ea7aa5f86 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Dec 2024 20:29:49 +0000
Subject: [PATCH 125/153] clarify release notes bullet point

---
 .github/PULL_REQUEST_TEMPLATE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index c5161f7f8..b3fcabec3 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -9,7 +9,7 @@ Please put an `x` into the boxes that apply. You can also fill these out after c
 - [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
 - [ ] I have added inline documentation which enables the maintainers with understanding the motivation
 - [ ] I have removed the stale documentation which is no longer relevant after this pull request
-- [ ] I have added release notes which provide the end users with a brief summary of the improvement from this pull request
+- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
 - [ ] I have run `clang-format` on all changed files
 - [ ] Any dependent changes have been merged
 
-- 
GitLab


From d46196f291a33539a089d7d09bcbc4d2270733c2 Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Tue, 17 Dec 2024 09:19:44 +0100
Subject: [PATCH 126/153] Enhance printing functionality (#1751)

* Added object print with all template parameters

* fix clang format

---------

Co-authored-by: ravil-mobile <ravil.aviva.com@gmail.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
---
 .../gpu/device/device_base.hpp                |   34 +
 .../impl/device_gemm_xdl_cshuffle_v3.hpp      |    1 +
 ...m_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc | 1383 +++++++++-------
 ..._uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc | 1439 +++++++++--------
 ...atmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc | 1007 ++++++------
 .../profiler/profile_gemm_universal_impl.hpp  |   18 +-
 6 files changed, 2095 insertions(+), 1787 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index 908ada016..736e241fd 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -5,6 +5,8 @@
 
 #include <string>
 #include <sstream>
+#include <regex>
+#include <optional>
 
 #include "ck/stream_config.hpp"
 
@@ -12,6 +14,34 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+#define GET_OBJECT_NAME_IMLP                                                  \
+    std::optional<std::string> GetObjectName() const override                 \
+    {                                                                         \
+        std::string str = __PRETTY_FUNCTION__;                                \
+        static std::regex obj_name_expr{"<std::string> (.*)::GetObjectName"}; \
+        std::smatch match;                                                    \
+        if(!std::regex_search(str, match, obj_name_expr))                     \
+        {                                                                     \
+            return str;                                                       \
+        }                                                                     \
+        return std::string(match[1]) + ';';                                   \
+    }
+
+#define GET_TEMPLATE_INFO_IMPL                                  \
+    std::optional<std::string> GetTemplateInfo() const override \
+    {                                                           \
+        std::string str = __PRETTY_FUNCTION__;                  \
+        static std::regex template_expr{"\\[(.*)\\]"};          \
+        std::smatch match;                                      \
+        if(!std::regex_search(str, match, template_expr))       \
+        {                                                       \
+            return std::nullopt;                                \
+        }                                                       \
+        return std::string(match[1]);                           \
+    }
+
+#define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL
+
 struct BaseArgument
 {
     BaseArgument()                    = default;
@@ -48,6 +78,10 @@ struct BaseOperator
 
     virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
 
+    virtual std::optional<std::string> GetObjectName() const { return std::nullopt; }
+
+    virtual std::optional<std::string> GetTemplateInfo() const { return std::nullopt; }
+
     virtual std::string GetTypeIdHashCode() const
     {
         std::ostringstream oss;
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
index 4489b2e5c..ad6aa1e7c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -729,6 +729,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
 
         return str.str();
     }
+    REGISTER_EXTRA_PRINTING_METHODS
 };
 
 } // namespace device
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
index 8b57611f0..1984ac645 100644
--- a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
@@ -3,610 +3,815 @@
 #endif
 
 #if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
-#   define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
 
-#   define _UK_PK_CVT_(x0_, x1_, y_)                           \
-    "  v_cmp_u_f32   s[36:37], " x0_ ", " x0_ "          \n"   \
-    "  v_add3_u32    v50, " x0_ ", %[v_nan_lo], 1        \n"   \
-    "  v_cndmask_b32  v54, v50, %[v_nan_hi], s[36:37]    \n"   \
-    "  v_cmp_u_f32   s[36:37], " x1_ ", " x1_ "          \n"   \
-    "  v_add3_u32    v50, " x1_ ", %[v_nan_lo], 1        \n"   \
-    "  v_cndmask_b32  v55, v50, %[v_nan_hi], s[36:37]    \n"   \
+#define _UK_PK_CVT_(x0_, x1_, y_)                            \
+    "  v_cmp_u_f32   s[36:37], " x0_ ", " x0_ "          \n" \
+    "  v_add3_u32    v50, " x0_ ", %[v_nan_lo], 1        \n" \
+    "  v_cndmask_b32  v54, v50, %[v_nan_hi], s[36:37]    \n" \
+    "  v_cmp_u_f32   s[36:37], " x1_ ", " x1_ "          \n" \
+    "  v_add3_u32    v50, " x1_ ", %[v_nan_lo], 1        \n" \
+    "  v_cndmask_b32  v55, v50, %[v_nan_hi], s[36:37]    \n" \
     "  v_perm_b32    " y_ ", v55, v54, s52               \n"
 
-#   define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
+#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
 
 #elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
 #define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
 
-#   define _UK_PK_CVT_(x0_, x1_, y_)                \
-    "  v_cvt_f16_f32  v54, " x0_ "  \n"             \
-    "  v_cvt_f16_f32  v55, " x1_ "  \n"             \
+#define _UK_PK_CVT_(x0_, x1_, y_)       \
+    "  v_cvt_f16_f32  v54, " x0_ "  \n" \
+    "  v_cvt_f16_f32  v55, " x1_ "  \n" \
     "  v_pack_b32_f16 " y_ ", v54, v55  \n"
 
-#   define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
+#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
 
 #endif
 
-
 ";-------------------------------------------------------------\n"
-" s_mov_b32 s52, 0x07060302 ; v_perm\n"
-" s_mov_b64 s[38:39], exec ; save current exec\n"
-" s_mov_b32 s8,    %[s_res_o0] \n"
-" s_mov_b32 s9,    %[s_res_o1] \n"
-" s_mov_b32 s12,    %[s_res_b0] \n"
-" s_mov_b32 s13,    %[s_res_b1] \n"
-" s_mov_b32 s14,    %[s_res_b2] \n"
-" s_mov_b32 s15,    %[s_res_b3] \n"
-" ds_read_b64   v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]                       \n"
-" ds_read_b64   v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]                     \n"
-" ds_read_b64   v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]                    \n"
-" ds_read_b64   v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]                    \n"
-" ds_read_b64   v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]                    \n"
-" ds_read_b64   v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]                    \n"
-" ds_read_b64   v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]                    \n"
-" ds_read_b64   v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]                    \n"
-" ds_read_b64   v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]                    \n"
-" ds_read_b64   v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]                    \n"
-" ds_read_b64   v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]                    \n"
-" ds_read_b64   v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]                    \n"
-" ds_read_b64   v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]                    \n"
-" ds_read_b64   v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]                    \n"
-" ds_read_b64   v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]                    \n"
-" ds_read_b64   v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]                    \n"
-" ds_read_b64   v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]                    \n"
-" ds_read_b64   v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]                    \n"
-" ds_read_b64   v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]                    \n"
-" ds_read_b64   v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]                    \n"
-" ds_read_b64   v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]                    \n"
-" ds_read_b64   v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]                    \n"
-" ds_read_b64   v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]                    \n"
-" ds_read_b64   v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]                    \n"
-" ds_read_b64   v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]                    \n"
-" ds_read_b64   v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]                    \n"
-" ds_read_b64   v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]                    \n"
-" ds_read_b64   v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]                    \n"
-" ds_read_b64   v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]                    \n"
-" ds_read_b64   v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]                    \n"
-" ds_read_b64   v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]                    \n"
-" ds_read_b64   v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]                    \n"
-" ds_read_b64   v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]                    \n"
-" ds_read_b64   v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]                    \n"
-" ds_read_b64   v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]                    \n"
-" ds_read_b64   v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]                    \n"
-" ds_read_b64   v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]                    \n"
-" ds_read_b64   v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]                    \n"
-" ds_read_b64   v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]                    \n"
-" ds_read_b64   v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]                    \n"
-" ds_read_b64   v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]                    \n"
-" ds_read_b64   v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]                    \n"
-" ds_read_b64   v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]                    \n"
-" ds_read_b64   v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]                    \n"
-" ds_read_b64   v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]                    \n"
-" ds_read_b64   v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]                    \n"
-" ds_read_b64   v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]                    \n"
-" ds_read_b64   v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]                    \n"
-" ds_read_b64   v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]                    \n"
-" ds_read_b64   v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]                    \n"
-" ds_read_b64   v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]                    \n"
-" ds_read_b64   v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]                    \n"
-" ds_read_b64   v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]                    \n"
-" ds_read_b64   v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]                    \n"
-" ds_read_b64   v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]                    \n"
-" ds_read_b64   v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]                    \n"
-" ds_read_b64   v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]                    \n"
-" ds_read_b64   v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]                    \n"
-" ds_read_b64   v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]                    \n"
-" ds_read_b64   v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]                    \n"
-" ds_read_b64   v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]                    \n"
-" ds_read_b64   v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]                    \n"
-" ds_read_b64   v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]                    \n"
-" ds_read_b64   v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]                    \n"
-"  s_waitcnt 0                    \n"
-"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
-"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
-"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
-"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
-"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s12, s86, s12                                  \n"
-"  s_addc_u32    s13, 0, s13                                    \n"
-"  s_waitcnt 0                    \n"
-"L_start%=:                    \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-"  s_barrier                                             \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
-"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
-"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
-"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
-"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
-"  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
-"  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
-"  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
-"  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
-_UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n"
-"  v_mul_f32     %[c0], %[scale_0], %[c0]                            \n"
-"  v_mul_f32     %[c1], %[scale_0], %[c1]                            \n"
-"  v_mul_f32     %[c2], %[scale_0], %[c2]                            \n"
-"  v_mul_f32     %[c3], %[scale_0], %[c3]                            \n"
-"  v_mul_f32     %[c4], %[scale_1], %[c4]                            \n"
-"  v_mul_f32     %[c5], %[scale_1], %[c5]                            \n"
-"  v_mul_f32     %[c6], %[scale_1], %[c6]                            \n"
-"  v_mul_f32     %[c7], %[scale_1], %[c7]                            \n"
-"  v_mul_f32     %[c8], %[scale_0], %[c8]                            \n"
-"  v_mul_f32     %[c9], %[scale_0], %[c9]                            \n"
-"  v_mul_f32     %[c10], %[scale_0], %[c10]                            \n"
-"  v_mul_f32     %[c11], %[scale_0], %[c11]                            \n"
-"  v_mul_f32     %[c12], %[scale_1], %[c12]                            \n"
-"  v_mul_f32     %[c13], %[scale_1], %[c13]                            \n"
-"  v_mul_f32     %[c14], %[scale_1], %[c14]                            \n"
-"  v_mul_f32     %[c15], %[scale_1], %[c15]                            \n"
-_UK_PK_CVT_("%[c0]",  "%[c1]",  "%[c0]")
-_UK_PK_CVT_("%[c2]",  "%[c3]",  "%[c1]")
-_UK_PK_CVT_("%[c4]",  "%[c5]",  "%[c2]")
-_UK_PK_CVT_("%[c6]",  "%[c7]",  "%[c3]")
-_UK_PK_CVT_("%[c8]",  "%[c9]",  "%[c4]")
-_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
-_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
-_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
-"  ;------------------------------  \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c0],%[c1]] offset:0    + %[shfl_base]               \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]               \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]               \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]               \n"
-"  s_waitcnt     lgkmcnt(0)                              \n"
-"  s_barrier                                             \n"
-"  ds_read_b32   %[c0], %[v_sfl_sld] offset:0    + %[shfl_base]                    \n"
-"  ds_read_b32   %[c1], %[v_sfl_sld] offset:32   + %[shfl_base]                    \n"
-"  ds_read_b32   %[c2], %[v_sfl_sld] offset:64   + %[shfl_base]                    \n"
-"  ds_read_b32   %[c3], %[v_sfl_sld] offset:96   + %[shfl_base]                    \n"
-"  ds_read_b32   %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]                    \n"
-"  ds_read_b32   %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]                    \n"
-"  ds_read_b32   %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]                    \n"
-"  ds_read_b32   %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]                    \n"
-"  s_waitcnt     lgkmcnt(0)                              \n"
-"  s_mov_b64     exec, %[s_execflag_0]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o0], %[c0], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_1]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o1], %[c1], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_2]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o2], %[c2], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_3]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o3], %[c3], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_4]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o4], %[c4], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_5]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o5], %[c5], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_6]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o6], %[c6], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_7]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o7], %[c7], s[8:9]  \n"
-"  s_mov_b64     exec, s[38:39]                           \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s12, s86, s12                                  \n"
-"  s_addc_u32    s13, 0, s13                                    \n"
-"  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
-"  s_addc_u32    s9, 0, s9                               \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-"  s_barrier                                             \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
-"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
-"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
-"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
-"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n"
-"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n"
-"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n"
-"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n"
-"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n"
-_UK_MFMA_ "  [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n"
-"  v_mul_f32     %[c16], %[scale_0], %[c16]                            \n"
-"  v_mul_f32     %[c17], %[scale_0], %[c17]                            \n"
-"  v_mul_f32     %[c18], %[scale_0], %[c18]                            \n"
-"  v_mul_f32     %[c19], %[scale_0], %[c19]                            \n"
-"  v_mul_f32     %[c20], %[scale_1], %[c20]                            \n"
-"  v_mul_f32     %[c21], %[scale_1], %[c21]                            \n"
-"  v_mul_f32     %[c22], %[scale_1], %[c22]                            \n"
-"  v_mul_f32     %[c23], %[scale_1], %[c23]                            \n"
-"  v_mul_f32     %[c24], %[scale_0], %[c24]                            \n"
-"  v_mul_f32     %[c25], %[scale_0], %[c25]                            \n"
-"  v_mul_f32     %[c26], %[scale_0], %[c26]                            \n"
-"  v_mul_f32     %[c27], %[scale_0], %[c27]                            \n"
-"  v_mul_f32     %[c28], %[scale_1], %[c28]                            \n"
-"  v_mul_f32     %[c29], %[scale_1], %[c29]                            \n"
-"  v_mul_f32     %[c30], %[scale_1], %[c30]                            \n"
-"  v_mul_f32     %[c31], %[scale_1], %[c31]                            \n"
+    " s_mov_b32 s52, 0x07060302 ; v_perm\n"
+    " s_mov_b64 s[38:39], exec ; save current exec\n"
+    " s_mov_b32 s8,    %[s_res_o0] \n"
+    " s_mov_b32 s9,    %[s_res_o1] \n"
+    " s_mov_b32 s12,    %[s_res_b0] \n"
+    " s_mov_b32 s13,    %[s_res_b1] \n"
+    " s_mov_b32 s14,    %[s_res_b2] \n"
+    " s_mov_b32 s15,    %[s_res_b3] \n"
+    " ds_read_b64   v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]                       \n"
+    " ds_read_b64   v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]                     \n"
+    " ds_read_b64   v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]                    \n"
+    " ds_read_b64   v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]                    \n"
+    "  s_waitcnt 0                    \n"
+    "  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+    "  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+    "  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+    "  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+    "  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+    "  s_add_u32     s12, s86, s12                                  \n"
+    "  s_addc_u32    s13, 0, s13                                    \n"
+    "  s_waitcnt 0                    \n"
+    "L_start%=:                    \n"
+    "  s_waitcnt     vmcnt(32)                               \n"
+    "  s_barrier                                             \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
+    "  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
+    "  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
+    "  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
+    "  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[30:31], "
+    "v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[62:63], "
+    "v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], "
+    "%[c3]] \n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], "
+    "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], "
+    "%[c7]] \n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], "
+    "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[94:95], "
+    "v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], "
+    "%[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
+    "  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_ "  [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], "
+    "%[c2], %[c3]] \n" _UK_MFMA_
+    "  [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] "
+    "\n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], "
+    "%[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
+    "  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_ "  [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], "
+    "%[c6], %[c7]] \n" _UK_MFMA_
+    "  [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] "
+    "\n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
+    "  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], "
+    "%[c10], %[c11]] \n" _UK_MFMA_
+    "  [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] "
+    "\n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], "
+    "%[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], "
+    "%[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], "
+    "%[c13], %[c14], %[c15]] \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], "
+    "%[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], "
+    "%[c13], %[c14], %[c15]] \n"
+    "  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], "
+    "%[c15]] \n" _UK_MFMA_ "  [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], "
+    "%[c13], %[c14], %[c15]] \n" _UK_MFMA_
+    "  [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], "
+    "%[c15]]\n"
+    "  v_mul_f32     %[c0], %[scale_0], %[c0]                            \n"
+    "  v_mul_f32     %[c1], %[scale_0], %[c1]                            \n"
+    "  v_mul_f32     %[c2], %[scale_0], %[c2]                            \n"
+    "  v_mul_f32     %[c3], %[scale_0], %[c3]                            \n"
+    "  v_mul_f32     %[c4], %[scale_1], %[c4]                            \n"
+    "  v_mul_f32     %[c5], %[scale_1], %[c5]                            \n"
+    "  v_mul_f32     %[c6], %[scale_1], %[c6]                            \n"
+    "  v_mul_f32     %[c7], %[scale_1], %[c7]                            \n"
+    "  v_mul_f32     %[c8], %[scale_0], %[c8]                            \n"
+    "  v_mul_f32     %[c9], %[scale_0], %[c9]                            \n"
+    "  v_mul_f32     %[c10], %[scale_0], %[c10]                            \n"
+    "  v_mul_f32     %[c11], %[scale_0], %[c11]                            \n"
+    "  v_mul_f32     %[c12], %[scale_1], %[c12]                            \n"
+    "  v_mul_f32     %[c13], %[scale_1], %[c13]                            \n"
+    "  v_mul_f32     %[c14], %[scale_1], %[c14]                            \n"
+    "  v_mul_f32     %[c15], %[scale_1], %[c15]                            \n" _UK_PK_CVT_(
+        "%[c0]", "%[c1]", "%[c0]") _UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
+        _UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]") _UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]") _UK_PK_CVT_(
+            "%[c8]", "%[c9]", "%[c4]") _UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
+            _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") _UK_PK_CVT_(
+                "%[c14]",
+                "%[c15]",
+                "%[c7]") "  ;------------------------------  \n"
+                         "  ds_write_b64  %[v_sfl_sst], [%[c0],%[c1]] offset:0    + %[shfl_base]   "
+                         "            \n"
+                         "  ds_write_b64  %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]   "
+                         "            \n"
+                         "  ds_write_b64  %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]   "
+                         "            \n"
+                         "  ds_write_b64  %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]   "
+                         "            \n"
+                         "  s_waitcnt     lgkmcnt(0)                              \n"
+                         "  s_barrier                                             \n"
+                         "  ds_read_b32   %[c0], %[v_sfl_sld] offset:0    + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c1], %[v_sfl_sld] offset:32   + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c2], %[v_sfl_sld] offset:64   + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c3], %[v_sfl_sld] offset:96   + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]           "
+                         "         \n"
+                         "  ds_read_b32   %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]           "
+                         "         \n"
+                         "  s_waitcnt     lgkmcnt(0)                              \n"
+                         "  s_mov_b64     exec, %[s_execflag_0]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o0], %[c0], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_1]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o1], %[c1], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_2]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o2], %[c2], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_3]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o3], %[c3], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_4]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o4], %[c4], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_5]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o5], %[c5], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_6]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o6], %[c6], s[8:9]  \n"
+                         "  s_mov_b64     exec, %[s_execflag_7]                    "
+                         "\n" _UK_ATOMIC_ADD_ "   %[v_os_o7], %[c7], s[8:9]  \n"
+                         "  s_mov_b64     exec, s[38:39]                           \n"
+                         "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
+                         "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+                         "  s_cbranch_scc0 L_end%=                                       \n"
+                         "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+                         "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+                         "  s_add_u32     s12, s86, s12                                  \n"
+                         "  s_addc_u32    s13, 0, s13                                    \n"
+                         "  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
+                         "  s_addc_u32    s9, 0, s9                               \n"
+                         "  s_waitcnt     vmcnt(32)                               \n"
+                         "  s_barrier                                             \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
+                         "  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[130:131], "
+                         "v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
+                         "  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
+                         "  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[146:147], "
+                         "v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
+                         "  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[162:163], "
+                         "v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[178:179], "
+                         "v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[194:195], "
+                         "v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[210:211], "
+                         "v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c16],%[c17],%[c18],%[c19]], acc[226:227], "
+                         "v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n"
+                         "  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], "
+                         "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n"
+                         "  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], "
+                         "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[12:15], 0 offen "
+                         "\n" _UK_MFMA_ "  [%[c24],%[c25],%[c26],%[c27]], acc[242:243], "
+                         "v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n"
+                         "  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[12:15], 0 offen "
+                         "offset:1024 \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], "
+                         "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[12:15], 0 offen "
+                         "offset:2048 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n"
+                         "  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[12:15], 0 offen "
+                         "offset:3072 \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], "
+                         "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
+                         "  [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], "
+                         "[%[c28],%[c29],%[c30],%[c31]]\n"
+                         "  v_mul_f32     %[c16], %[scale_0], %[c16]                            \n"
+                         "  v_mul_f32     %[c17], %[scale_0], %[c17]                            \n"
+                         "  v_mul_f32     %[c18], %[scale_0], %[c18]                            \n"
+                         "  v_mul_f32     %[c19], %[scale_0], %[c19]                            \n"
+                         "  v_mul_f32     %[c20], %[scale_1], %[c20]                            \n"
+                         "  v_mul_f32     %[c21], %[scale_1], %[c21]                            \n"
+                         "  v_mul_f32     %[c22], %[scale_1], %[c22]                            \n"
+                         "  v_mul_f32     %[c23], %[scale_1], %[c23]                            \n"
+                         "  v_mul_f32     %[c24], %[scale_0], %[c24]                            \n"
+                         "  v_mul_f32     %[c25], %[scale_0], %[c25]                            \n"
+                         "  v_mul_f32     %[c26], %[scale_0], %[c26]                            \n"
+                         "  v_mul_f32     %[c27], %[scale_0], %[c27]                            \n"
+                         "  v_mul_f32     %[c28], %[scale_1], %[c28]                            \n"
+                         "  v_mul_f32     %[c29], %[scale_1], %[c29]                            \n"
+                         "  v_mul_f32     %[c30], %[scale_1], %[c30]                            \n"
+                         "  v_mul_f32     %[c31], %[scale_1], %[c31]                            \n"
 
-_UK_PK_CVT_("%[c16]",  "%[c17]",  "%[c16]")
-_UK_PK_CVT_("%[c18]",  "%[c19]",  "%[c17]")
-_UK_PK_CVT_("%[c20]",  "%[c21]",  "%[c18]")
-_UK_PK_CVT_("%[c22]",  "%[c23]",  "%[c19]")
-_UK_PK_CVT_("%[c24]",  "%[c25]",  "%[c20]")
-_UK_PK_CVT_("%[c26]",  "%[c27]",  "%[c21]")
-_UK_PK_CVT_("%[c28]",  "%[c29]",  "%[c22]")
-_UK_PK_CVT_("%[c30]",  "%[c31]",  "%[c23]")
+    _UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]") _UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]") _UK_PK_CVT_(
+        "%[c20]", "%[c21]", "%[c18]") _UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
+        _UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]") _UK_PK_CVT_(
+            "%[c26]", "%[c27]", "%[c21]") _UK_PK_CVT_("%[c28]",
+                                                      "%[c29]",
+                                                      "%[c22]") _UK_PK_CVT_("%[c30]",
+                                                                            "%[c31]",
+                                                                            "%[c23]")
 
-"  ;------------------------------  \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c16],%[c17]] offset:0    + %[shfl_base]         \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]         \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]         \n"
-"  ds_write_b64  %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]         \n"
-"  s_waitcnt     lgkmcnt(0)                              \n"
-"  s_barrier                                             \n"
-"  ds_read_b32   %[c16], %[v_sfl_sld] offset:0    + %[shfl_base]                  \n"
-"  ds_read_b32   %[c17], %[v_sfl_sld] offset:32   + %[shfl_base]                  \n"
-"  ds_read_b32   %[c18], %[v_sfl_sld] offset:64   + %[shfl_base]                  \n"
-"  ds_read_b32   %[c19], %[v_sfl_sld] offset:96   + %[shfl_base]                  \n"
-"  ds_read_b32   %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]                  \n"
-"  ds_read_b32   %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]                  \n"
-"  ds_read_b32   %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]                  \n"
-"  ds_read_b32   %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]                  \n"
-"  s_waitcnt     lgkmcnt(0)                              \n"
-"  s_mov_b64     exec, %[s_execflag_0]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o0], %[c16], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_1]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o1], %[c17], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_2]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o2], %[c18], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_3]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o3], %[c19], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_4]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o4], %[c20], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_5]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o5], %[c21], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_6]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o6], %[c22], s[8:9]  \n"
-"  s_mov_b64     exec, %[s_execflag_7]                    \n"
-_UK_ATOMIC_ADD_ "   %[v_os_o7], %[c23], s[8:9]  \n"
-"  s_mov_b64     exec, s[38:39]                           \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s12, s86, s12                                  \n"
-"  s_addc_u32    s13, 0, s13                                    \n"
-"  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
-"  s_addc_u32    s9, 0, s9                               \n"
-"  s_branch      L_start%=          \n"
-"L_end%=:                                                \n"
+            "  ;------------------------------  \n"
+            "  ds_write_b64  %[v_sfl_sst], [%[c16],%[c17]] offset:0    + %[shfl_base]         \n"
+            "  ds_write_b64  %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]         \n"
+            "  ds_write_b64  %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]         \n"
+            "  ds_write_b64  %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]         \n"
+            "  s_waitcnt     lgkmcnt(0)                              \n"
+            "  s_barrier                                             \n"
+            "  ds_read_b32   %[c16], %[v_sfl_sld] offset:0    + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c17], %[v_sfl_sld] offset:32   + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c18], %[v_sfl_sld] offset:64   + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c19], %[v_sfl_sld] offset:96   + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]                  \n"
+            "  ds_read_b32   %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]                  \n"
+            "  s_waitcnt     lgkmcnt(0)                              \n"
+            "  s_mov_b64     exec, %[s_execflag_0]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o0], %[c16], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_1]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o1], %[c17], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_2]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o2], %[c18], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_3]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o3], %[c19], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_4]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o4], %[c20], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_5]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o5], %[c21], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_6]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o6], %[c22], s[8:9]  \n"
+            "  s_mov_b64     exec, %[s_execflag_7]                    \n" _UK_ATOMIC_ADD_
+            "   %[v_os_o7], %[c23], s[8:9]  \n"
+            "  s_mov_b64     exec, s[38:39]                           \n"
+            "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1     ; k--      \n"
+            "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+            "  s_cbranch_scc0 L_end%=                                       \n"
+            "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+            "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+            "  s_add_u32     s12, s86, s12                                  \n"
+            "  s_addc_u32    s13, 0, s13                                    \n"
+            "  s_add_u32     s8, %[s_tile_os_o], s8                             \n"
+            "  s_addc_u32    s9, 0, s9                               \n"
+            "  s_branch      L_start%=          \n"
+            "L_end%=:                                                \n"
 
 #undef _UK_MFMA_
 #undef _UK_PK_CVT_
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
index b8c6d2002..512dda25d 100644
--- a/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
@@ -3,706 +3,767 @@
 #endif
 
 #if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
-# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
+#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
 
-# define _UK_PK_CVT_(x0_, x1_, y_) \
- " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
- " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
- " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
- " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
- " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
- " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
- " v_perm_b32 " y_ ", v55, v54, s52 \n"
+#define _UK_PK_CVT_(x0_, x1_, y_)                       \
+    " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n"        \
+    " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n"       \
+    " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
+    " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n"        \
+    " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n"       \
+    " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
+    " v_perm_b32 " y_ ", v55, v54, s52 \n"
 
-# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
+#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
 
 #elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
 #define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
 
-# define _UK_PK_CVT_(x0_, x1_, y_) \
- " v_cvt_f16_f32 v54, " x0_ " \n" \
- " v_cvt_f16_f32 v55, " x1_ " \n" \
- " v_pack_b32_f16 " y_ ", v54, v55 \n"
+#define _UK_PK_CVT_(x0_, x1_, y_)    \
+    " v_cvt_f16_f32 v54, " x0_ " \n" \
+    " v_cvt_f16_f32 v55, " x1_ " \n" \
+    " v_pack_b32_f16 " y_ ", v54, v55 \n"
 
-# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
+#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
 
 #endif
 
-
 ";-------------------------------------------------------------\n"
-" s_mov_b32 s52, 0x07060302 ; v_perm\n"
-" s_mov_b64 s[38:39], exec ; save current exec\n"
-" s_mov_b32 s8, %[s_res_o0] \n"
-" s_mov_b32 s9, %[s_res_o1] \n"
-" s_mov_b32 s12, %[s_res_b0] \n"
-" s_mov_b32 s13, %[s_res_b1] \n"
-" s_mov_b32 s14, %[s_res_b2] \n"
-" s_mov_b32 s15, %[s_res_b3] \n" 
-" s_mov_b32 s59, 0 \n"
-" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
-" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
-" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
-" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
-" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
-" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
-" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
-" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
-" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
-" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
-" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
-" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
-" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
-" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
-" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
-" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
-" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
-" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
-" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
-" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
-" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
-" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
-" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
-" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
-" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
-" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
-" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
-" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
-" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
-" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
-" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
-" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
-" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
-" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
-" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
-" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
-" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
-" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
-" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
-" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
-" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
-" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
-" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
-" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
-" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
-" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
-" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
-" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
-" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
-" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
-" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
-" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
-" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
-" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
-" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
-" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
-" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
-" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
-" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
-" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
-" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
-" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
-" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
-" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n" 
-" s_waitcnt 0 \n"
-" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
-" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
-" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
-" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
-" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
-" s_add_u32 s12, %[s_tile_os_b], s12 \n" 
-" s_addc_u32 s13, 0, s13 \n" 
-" v_mov_b32 v64, 0 \n"
-" v_mov_b32 v80, 0 \n"
-" v_mov_b32 v65, 0 \n"
-" v_mov_b32 v81, 0 \n"
-" v_mov_b32 v66, 0 \n"
-" v_mov_b32 v82, 0 \n"
-" v_mov_b32 v67, 0 \n"
-" v_mov_b32 v83, 0 \n"
-" v_mov_b32 v68, 0 \n"
-" v_mov_b32 v84, 0 \n"
-" v_mov_b32 v69, 0 \n"
-" v_mov_b32 v85, 0 \n"
-" v_mov_b32 v70, 0 \n"
-" v_mov_b32 v86, 0 \n"
-" v_mov_b32 v71, 0 \n"
-" v_mov_b32 v87, 0 \n"
-" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168  \n"
-" s_mov_b32 s80, 0 \n" 
-" s_waitcnt vmcnt(24) \n" 
-"label_0AA6: \n"
-" s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
-" s_barrier \n" 
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0  \n"
-" ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
-" ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]  \n"
- " buffer_load_dwordx4 acc[128:131],  %[v_os_b0], s[12:15], 0 offen  \n" 
-" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]  \n"
-" ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
-" ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]  \n"
-" ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
-" ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]  \n"
- " buffer_load_dwordx4 acc[132:135],  %[v_os_b0], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]  \n"
-" ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
-" ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]  \n"
- " buffer_load_dwordx4 acc[136:139],  %[v_os_b0], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]  \n"
- " buffer_load_dwordx4 acc[140:143],  %[v_os_b0], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]  \n"
- " s_waitcnt lgkmcnt(0) \n"
- " s_mov_b64 exec, %[s_execflag_0] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]  \n"
- " buffer_load_dwordx4 acc[144:147],  %[v_os_b1], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]  \n"
- " buffer_load_dwordx4 acc[148:151],  %[v_os_b1], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]  \n"
- " buffer_load_dwordx4 acc[152:155],  %[v_os_b1], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]  \n"
- " buffer_load_dwordx4 acc[156:159],  %[v_os_b1], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]  \n"
- " s_mov_b64 exec, %[s_execflag_1] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]  \n"
-" s_waitcnt vmcnt(30)   \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]  \n"
- " buffer_load_dwordx4 acc[160:163],  %[v_os_b2], s[12:15], 0 offen  \n" 
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]  \n"
- " buffer_load_dwordx4 acc[164:167],  %[v_os_b2], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]  \n"
- " buffer_load_dwordx4 acc[168:171],  %[v_os_b2], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]  \n"
- " buffer_load_dwordx4 acc[172:175],  %[v_os_b2], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]  \n"
- " s_mov_b64 exec, %[s_execflag_2] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]  \n"
- " buffer_load_dwordx4 acc[176:179],  %[v_os_b3], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]  \n"
- " buffer_load_dwordx4 acc[180:183],  %[v_os_b3], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]  \n"
- " buffer_load_dwordx4 acc[184:187],  %[v_os_b3], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]  \n"
- " buffer_load_dwordx4 acc[188:191],  %[v_os_b3], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]  \n"
- " s_mov_b64 exec, %[s_execflag_3] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]    \n"
-" s_waitcnt vmcnt(30)   \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]  \n"
- " buffer_load_dwordx4 acc[192:195],  %[v_os_b4], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]  \n"
- " buffer_load_dwordx4 acc[196:199],  %[v_os_b4], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]  \n"
- " buffer_load_dwordx4 acc[200:203],  %[v_os_b4], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]  \n"
- " buffer_load_dwordx4 acc[204:207],  %[v_os_b4], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]  \n"
- " s_mov_b64 exec, %[s_execflag_4] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]  \n"
- " buffer_load_dwordx4 acc[208:211],  %[v_os_b5], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]  \n"
- " buffer_load_dwordx4 acc[212:215],  %[v_os_b5], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]  \n"
- " buffer_load_dwordx4 acc[216:219],  %[v_os_b5], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]  \n"
- " buffer_load_dwordx4 acc[220:223],  %[v_os_b5], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]  \n"
- " s_mov_b64 exec, %[s_execflag_5] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_waitcnt vmcnt(30)   \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]  \n"
- " buffer_load_dwordx4 acc[224:227],  %[v_os_b6], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]  \n"
- " buffer_load_dwordx4 acc[228:231],  %[v_os_b6], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]  \n"
- _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]  \n"
- " buffer_load_dwordx4 acc[232:235],  %[v_os_b6], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]  \n"
- " buffer_load_dwordx4 acc[236:239],  %[v_os_b6], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]  \n"
- _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]  \n"
- " s_mov_b64 exec, %[s_execflag_6] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]  \n"
- " buffer_load_dwordx4 acc[240:243],  %[v_os_b7], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]  \n"
- " buffer_load_dwordx4 acc[244:247],  %[v_os_b7], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]  \n"
- _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]  \n"
- " buffer_load_dwordx4 acc[248:251],  %[v_os_b7], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]  \n"
- " buffer_load_dwordx4 acc[252:255],  %[v_os_b7], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]  \n"
- _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]  \n"
- " s_mov_b64 exec, %[s_execflag_7] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_add_u32 s60, 0x00000100, s80  \n"
-" s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
-" s_cselect_b32 s56, %[s_tile_os_b], 0  \n"
-" s_add_u32 s12, s56, s12  \n"
-" s_addc_u32 s13, 0, s13  \n"
-" s_cmp_ge_u32 s80, 0x00000100  \n"
-" s_cselect_b32 s59, %[s_tile_os_o], s59  \n"
-" s_add_u32 s8, s59, s8  \n"
-" s_addc_u32 s9, 0, s9  \n"
-" v_mul_f32 %[c0], %[scale_0], %[c0] \n"
-" v_mul_f32 %[c1], %[scale_0], %[c1] \n"
-" v_mul_f32 %[c2], %[scale_0], %[c2] \n"
-" v_mul_f32 %[c3], %[scale_0], %[c3] \n"
-" v_mul_f32 %[c4], %[scale_1], %[c4] \n"
-" v_mul_f32 %[c5], %[scale_1], %[c5] \n"
-" v_mul_f32 %[c6], %[scale_1], %[c6] \n"
-" v_mul_f32 %[c7], %[scale_1], %[c7] \n"
-" v_mul_f32 %[c8], %[scale_0], %[c8] \n"
-" v_mul_f32 %[c9], %[scale_0], %[c9] \n"
-" v_mul_f32 %[c10], %[scale_0], %[c10] \n"
-" v_mul_f32 %[c11], %[scale_0], %[c11] \n"
-" v_mul_f32 %[c12], %[scale_1], %[c12] \n"
-" v_mul_f32 %[c13], %[scale_1], %[c13] \n"
-" v_mul_f32 %[c14], %[scale_1], %[c14] \n"
-" v_mul_f32 %[c15], %[scale_1], %[c15] \n"
-_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]")
-_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
-_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]")
-_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]")
-_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]")
-_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
-_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
-_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
-" s_addk_i32 s80, 0x0080  \n"
-" s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
-" s_cbranch_scc0 label_0EC1  \n"
-" s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
-" s_barrier  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0  \n"
-" ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
-" ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
-" ds_write_b64 v3, v[64:65] offset:16640  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]  \n"
- " buffer_load_dwordx4 acc[0:3],  %[v_os_b0], s[12:15], 0 offen  \n"
-" ds_write_b64 v3, v[66:67] offset:20992  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]  \n"
-" ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
-" ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
-" ds_write_b64 v3, v[68:69] offset:18816  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]  \n"
-" ds_write_b64 v3, v[70:71] offset:23168  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]  \n"
-" ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
-" ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]  \n"
- " buffer_load_dwordx4 acc[4:7],  %[v_os_b0], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]  \n"
-" ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
-" ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]  \n"
- " buffer_load_dwordx4 acc[8:11],  %[v_os_b0], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]  \n"
- " buffer_load_dwordx4 acc[12:15],  %[v_os_b0], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]  \n"
- " s_waitcnt lgkmcnt(0) \n" 
- " s_mov_b64 exec, %[s_execflag_0] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]  \n"
- " buffer_load_dwordx4 acc[16:19],  %[v_os_b1], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]  \n"
- " buffer_load_dwordx4 acc[20:23],  %[v_os_b1], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]  \n"
- " buffer_load_dwordx4 acc[24:27],  %[v_os_b1], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]  \n"
- " buffer_load_dwordx4 acc[28:31],  %[v_os_b1], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]  \n"
- " s_mov_b64 exec, %[s_execflag_1] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_waitcnt vmcnt(30) \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]  \n"
- " buffer_load_dwordx4 acc[32:35],  %[v_os_b2], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]  \n"
- " buffer_load_dwordx4 acc[36:39],  %[v_os_b2], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]  \n"
- " buffer_load_dwordx4 acc[40:43],  %[v_os_b2], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]  \n"
- " buffer_load_dwordx4 acc[44:47],  %[v_os_b2], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]  \n"
- " s_mov_b64 exec, %[s_execflag_2] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]  \n"
- " buffer_load_dwordx4 acc[48:51],  %[v_os_b3], s[12:15], 0 offen  \n" 
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]  \n"
- " buffer_load_dwordx4 acc[52:55],  %[v_os_b3], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]  \n"
- " buffer_load_dwordx4 acc[56:59],  %[v_os_b3], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]  \n"
- " buffer_load_dwordx4 acc[60:63],  %[v_os_b3], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]  \n"
- " s_mov_b64 exec, %[s_execflag_3] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_waitcnt vmcnt(30) \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]  \n"
- " buffer_load_dwordx4 acc[64:67],  %[v_os_b4], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]  \n"
- " buffer_load_dwordx4 acc[68:71],  %[v_os_b4], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]  \n"
- " buffer_load_dwordx4 acc[72:75],  %[v_os_b4], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]  \n"
- " buffer_load_dwordx4 acc[76:79],  %[v_os_b4], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]  \n"
- " s_mov_b64 exec, %[s_execflag_4] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]  \n"
- " buffer_load_dwordx4 acc[80:83],  %[v_os_b5], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]  \n"
- " buffer_load_dwordx4 acc[84:87],  %[v_os_b5], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]  \n"
- " buffer_load_dwordx4 acc[88:91],  %[v_os_b5], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]  \n"
- " buffer_load_dwordx4 acc[92:95],  %[v_os_b5], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]  \n"
- " s_mov_b64 exec, %[s_execflag_5] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_waitcnt vmcnt(30)  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]  \n"
- " buffer_load_dwordx4 acc[96:99],  %[v_os_b6], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]  \n"
- " buffer_load_dwordx4 acc[100:103],  %[v_os_b6], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83]  \n"
- _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]  \n"
- " buffer_load_dwordx4 acc[104:107],  %[v_os_b6], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]  \n"
- " buffer_load_dwordx4 acc[108:111],  %[v_os_b6], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87]  \n"
- _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]  \n"
- " s_mov_b64 exec, %[s_execflag_6] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]  \n"
- " buffer_load_dwordx4 acc[112:115],  %[v_os_b7], s[12:15], 0 offen  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]  \n"
- " buffer_load_dwordx4 acc[116:119],  %[v_os_b7], s[12:15], 0 offen offset:1024  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91]  \n"
- _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]  \n"
- " buffer_load_dwordx4 acc[120:123],  %[v_os_b7], s[12:15], 0 offen offset:2048  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]  \n"
- " buffer_load_dwordx4 acc[124:127],  %[v_os_b7], s[12:15], 0 offen offset:3072  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95]  \n"
- _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]  \n"
- " s_mov_b64 exec, %[s_execflag_7] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_add_u32 s60, 0x00000100, s80  \n"
-" s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
-" s_cselect_b32 s56, s56, 0  \n"
-" s_add_u32 s12, s56, s12  \n"
-" s_addc_u32 s13, 0, s13  \n"
-" s_cmp_ge_u32 s80, 0x00000100  \n"
-" s_cselect_b32 s59, 0x00000100, s59  \n"
-" s_add_u32 s8, s59, s8  \n"
-" s_addc_u32 s9, 0, s9  \n"
-" v_mul_f32 %[c16], %[scale_0], %[c16] \n"
-" v_mul_f32 %[c17], %[scale_0], %[c17] \n"
-" v_mul_f32 %[c18], %[scale_0], %[c18] \n"
-" v_mul_f32 %[c19], %[scale_0], %[c19] \n"
-" v_mul_f32 %[c20], %[scale_1], %[c20] \n"
-" v_mul_f32 %[c21], %[scale_1], %[c21] \n"
-" v_mul_f32 %[c22], %[scale_1], %[c22] \n"
-" v_mul_f32 %[c23], %[scale_1], %[c23] \n"
-" v_mul_f32 %[c24], %[scale_0], %[c24] \n"
-" v_mul_f32 %[c25], %[scale_0], %[c25] \n"
-" v_mul_f32 %[c26], %[scale_0], %[c26] \n"
-" v_mul_f32 %[c27], %[scale_0], %[c27] \n"
-" v_mul_f32 %[c28], %[scale_1], %[c28] \n"
-" v_mul_f32 %[c29], %[scale_1], %[c29] \n"
-" v_mul_f32 %[c30], %[scale_1], %[c30] \n"
-" v_mul_f32 %[c31], %[scale_1], %[c31] \n"
-_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]")
-_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
-_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]")
-_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
-_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
-_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
-_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
-_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]")
-" s_addk_i32 s80, 0x0080  \n"
-" s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
-" s_cbranch_scc0 label_0EC1  \n"
-" s_branch label_0AA6  \n"
-" label_0EC1: \n"
-" s_waitcnt lgkmcnt(0)  \n"
-" s_barrier  \n"
-" ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
-" ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
-" ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
-" ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
-" ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
-" ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n"
-" ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
-" ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n"
-" s_waitcnt lgkmcnt(0)  \n"
- " s_mov_b64 exec, %[s_execflag_0] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_1] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_2] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_3] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_4] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_5] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_6] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_7] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]                           \n"
-" s_add_u32 s8, s59, s8  \n"
-" s_addc_u32 s9, 0, s9  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520  \n"
-" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872  \n"
-" s_waitcnt lgkmcnt(0)  \n"
-" s_barrier  \n"
-" ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
-" ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
-" ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
-" ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
-" ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
-" ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n"
-" ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
-" ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n"
-" s_waitcnt lgkmcnt(0)  \n"
-" s_mov_b64 exec, %[s_execflag_0] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_1] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_2] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_3] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_4] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_5] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_6] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n" 
- " s_mov_b64 exec, %[s_execflag_7] \n"
-_UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n" 
-"  s_mov_b64     exec, s[38:39]  \n"
+    " s_mov_b32 s52, 0x07060302 ; v_perm\n"
+    " s_mov_b64 s[38:39], exec ; save current exec\n"
+    " s_mov_b32 s8, %[s_res_o0] \n"
+    " s_mov_b32 s9, %[s_res_o1] \n"
+    " s_mov_b32 s12, %[s_res_b0] \n"
+    " s_mov_b32 s13, %[s_res_b1] \n"
+    " s_mov_b32 s14, %[s_res_b2] \n"
+    " s_mov_b32 s15, %[s_res_b3] \n"
+    " s_mov_b32 s59, 0 \n"
+    " ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
+    " ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
+    " ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
+    " ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
+    " ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
+    " ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
+    " ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
+    " ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
+    " ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
+    " ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
+    " ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
+    " ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
+    " ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
+    " ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
+    " ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
+    " ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
+    " ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
+    " ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
+    " ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
+    " ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
+    " ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
+    " ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
+    " ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
+    " ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
+    " ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
+    " ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
+    " ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
+    " ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
+    " ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
+    " ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
+    " ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
+    " ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
+    " ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
+    " ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
+    " ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
+    " ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
+    " ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
+    " ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
+    " ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
+    " ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
+    " ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
+    " ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
+    " ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
+    " ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
+    " ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
+    " ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
+    " ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
+    " ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
+    " ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
+    " ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
+    " ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
+    " ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
+    " ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
+    " ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
+    " ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
+    " ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
+    " ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
+    " ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
+    " ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
+    " ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
+    " ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
+    " ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
+    " ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
+    " ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
+    " s_waitcnt 0 \n"
+    " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
+    " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
+    " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
+    " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
+    " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
+    " s_add_u32 s12, %[s_tile_os_b], s12 \n"
+    " s_addc_u32 s13, 0, s13 \n"
+    " v_mov_b32 v64, 0 \n"
+    " v_mov_b32 v80, 0 \n"
+    " v_mov_b32 v65, 0 \n"
+    " v_mov_b32 v81, 0 \n"
+    " v_mov_b32 v66, 0 \n"
+    " v_mov_b32 v82, 0 \n"
+    " v_mov_b32 v67, 0 \n"
+    " v_mov_b32 v83, 0 \n"
+    " v_mov_b32 v68, 0 \n"
+    " v_mov_b32 v84, 0 \n"
+    " v_mov_b32 v69, 0 \n"
+    " v_mov_b32 v85, 0 \n"
+    " v_mov_b32 v70, 0 \n"
+    " v_mov_b32 v86, 0 \n"
+    " v_mov_b32 v71, 0 \n"
+    " v_mov_b32 v87, 0 \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168  \n"
+    " s_mov_b32 s80, 0 \n"
+    " s_waitcnt vmcnt(24) \n"
+    "label_0AA6: \n"
+    " s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
+    " s_barrier \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0  \n"
+    " ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
+    " ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[128:131],  %[v_os_b0], s[12:15], 0 offen  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]  \n"
+    " ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
+    " ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]  \n"
+    " ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]  \n"
+    " ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
+    " ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[132:135],  %[v_os_b0], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]  \n"
+    " ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
+    " ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[136:139],  %[v_os_b0], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[140:143],  %[v_os_b0], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]  \n"
+    " s_waitcnt lgkmcnt(0) \n"
+    " s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[144:147],  %[v_os_b1], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[148:151],  %[v_os_b1], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[152:155],  %[v_os_b1], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[156:159],  %[v_os_b1], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]  \n"
+    " s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]  \n"
+    " s_waitcnt vmcnt(30)   \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[160:163],  %[v_os_b2], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[164:167],  %[v_os_b2], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[168:171],  %[v_os_b2], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[172:175],  %[v_os_b2], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]  \n"
+    " s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[176:179],  %[v_os_b3], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[180:183],  %[v_os_b3], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[184:187],  %[v_os_b3], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[188:191],  %[v_os_b3], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]  \n"
+    " s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]    \n"
+    " s_waitcnt vmcnt(30)   \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[192:195],  %[v_os_b4], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[196:199],  %[v_os_b4], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[200:203],  %[v_os_b4], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[204:207],  %[v_os_b4], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]  \n"
+    " s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[208:211],  %[v_os_b5], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[212:215],  %[v_os_b5], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[216:219],  %[v_os_b5], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[220:223],  %[v_os_b5], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]  \n"
+    " s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n"
+    " s_waitcnt vmcnt(30)   \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[224:227],  %[v_os_b6], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]  \n"
+    " buffer_load_dwordx4 acc[228:231],  %[v_os_b6], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]  \n" _UK_MFMA_
+    " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[232:235],  %[v_os_b6], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]  \n"
+    " buffer_load_dwordx4 acc[236:239],  %[v_os_b6], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]  \n" _UK_MFMA_
+    " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]  \n"
+    " s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[240:243],  %[v_os_b7], s[12:15], 0 offen  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]  \n"
+    " buffer_load_dwordx4 acc[244:247],  %[v_os_b7], s[12:15], 0 offen offset:1024  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]  \n" _UK_MFMA_
+    " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[248:251],  %[v_os_b7], s[12:15], 0 offen offset:2048  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]  \n"
+    " buffer_load_dwordx4 acc[252:255],  %[v_os_b7], s[12:15], 0 offen offset:3072  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]  \n" _UK_MFMA_
+    " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]  \n"
+    " s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
+    "  s_mov_b64     exec, s[38:39]                           \n"
+    " s_add_u32 s60, 0x00000100, s80  \n"
+    " s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
+    " s_cselect_b32 s56, %[s_tile_os_b], 0  \n"
+    " s_add_u32 s12, s56, s12  \n"
+    " s_addc_u32 s13, 0, s13  \n"
+    " s_cmp_ge_u32 s80, 0x00000100  \n"
+    " s_cselect_b32 s59, %[s_tile_os_o], s59  \n"
+    " s_add_u32 s8, s59, s8  \n"
+    " s_addc_u32 s9, 0, s9  \n"
+    " v_mul_f32 %[c0], %[scale_0], %[c0] \n"
+    " v_mul_f32 %[c1], %[scale_0], %[c1] \n"
+    " v_mul_f32 %[c2], %[scale_0], %[c2] \n"
+    " v_mul_f32 %[c3], %[scale_0], %[c3] \n"
+    " v_mul_f32 %[c4], %[scale_1], %[c4] \n"
+    " v_mul_f32 %[c5], %[scale_1], %[c5] \n"
+    " v_mul_f32 %[c6], %[scale_1], %[c6] \n"
+    " v_mul_f32 %[c7], %[scale_1], %[c7] \n"
+    " v_mul_f32 %[c8], %[scale_0], %[c8] \n"
+    " v_mul_f32 %[c9], %[scale_0], %[c9] \n"
+    " v_mul_f32 %[c10], %[scale_0], %[c10] \n"
+    " v_mul_f32 %[c11], %[scale_0], %[c11] \n"
+    " v_mul_f32 %[c12], %[scale_1], %[c12] \n"
+    " v_mul_f32 %[c13], %[scale_1], %[c13] \n"
+    " v_mul_f32 %[c14], %[scale_1], %[c14] \n"
+    " v_mul_f32 %[c15], %[scale_1], %[c15] \n" _UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]") _UK_PK_CVT_(
+        "%[c2]",
+        "%[c3]",
+        "%[c1]") _UK_PK_CVT_("%[c4]",
+                             "%[c5]",
+                             "%[c2]") _UK_PK_CVT_("%[c6]",
+                                                  "%[c7]",
+                                                  "%[c3]") _UK_PK_CVT_("%[c8]",
+                                                                       "%[c9]",
+                                                                       "%[c4]") _UK_PK_CVT_("%["
+                                                                                            "c10]",
+                                                                                            "%["
+                                                                                            "c11]",
+                                                                                            "%[c5]")
+        _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") _UK_PK_CVT_(
+            "%[c14]",
+            "%[c15]",
+            "%[c7]") " s_addk_i32 s80, 0x0080  \n"
+                     " s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
+                     " s_cbranch_scc0 label_0EC1  \n"
+                     " s_waitcnt vmcnt(30) & lgkmcnt(0)  \n"
+                     " s_barrier  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0  \n"
+                     " ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
+                     " ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
+                     " ds_write_b64 v3, v[64:65] offset:16640  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[0:3],  %[v_os_b0], s[12:15], 0 offen  \n"
+                     " ds_write_b64 v3, v[66:67] offset:20992  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]  \n"
+                     " ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
+                     " ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
+                     " ds_write_b64 v3, v[68:69] offset:18816  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]  \n"
+                     " ds_write_b64 v3, v[70:71] offset:23168  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]  \n"
+                     " ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
+                     " ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[4:7],  %[v_os_b0], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]  \n"
+                     " ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
+                     " ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[8:11],  %[v_os_b0], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[12:15],  %[v_os_b0], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]  \n"
+                     " s_waitcnt lgkmcnt(0) \n"
+                     " s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o0], v10, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[16:19],  %[v_os_b1], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[20:23],  %[v_os_b1], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[24:27],  %[v_os_b1], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[28:31],  %[v_os_b1], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]  \n"
+                     " s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o1], v11, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n"
+                     " s_waitcnt vmcnt(30) \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[32:35],  %[v_os_b2], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], "
+                     "v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[36:39],  %[v_os_b2], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], "
+                     "v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[40:43],  %[v_os_b2], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[44:47],  %[v_os_b2], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]  \n"
+                     " s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o2], v12, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[48:51],  %[v_os_b3], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[52:55],  %[v_os_b3], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[56:59],  %[v_os_b3], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[60:63],  %[v_os_b3], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]  \n"
+                     " s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o3], v13, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n"
+                     " s_waitcnt vmcnt(30) \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[64:67],  %[v_os_b4], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], "
+                     "v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[68:71],  %[v_os_b4], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], "
+                     "v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[72:75],  %[v_os_b4], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[76:79],  %[v_os_b4], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]  \n"
+                     " s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o4], v14, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[80:83],  %[v_os_b5], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[84:87],  %[v_os_b5], s[12:15], 0 offen offset:1024  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[88:91],  %[v_os_b5], s[12:15], 0 offen offset:2048  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[92:95],  %[v_os_b5], s[12:15], 0 offen offset:3072  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]  \n"
+                     " s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o5], v15, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n"
+                     " s_waitcnt vmcnt(30)  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[96:99],  %[v_os_b6], s[12:15], 0 offen  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], "
+                     "v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83]  "
+                     "\n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]  \n"
+                     " buffer_load_dwordx4 acc[100:103],  %[v_os_b6], s[12:15], 0 offen "
+                     "offset:1024  \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], "
+                     "v[188:189], v[80:83]  \n" _UK_MFMA_
+                     " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[104:107],  %[v_os_b6], s[12:15], 0 offen "
+                     "offset:2048  \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], "
+                     "v[244:245], v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87]  "
+                     "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], "
+                     "v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]  \n"
+                     " buffer_load_dwordx4 acc[108:111],  %[v_os_b6], s[12:15], 0 offen "
+                     "offset:3072  \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], "
+                     "v[252:253], v[84:87]  \n" _UK_MFMA_
+                     " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]  \n"
+                     " s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o6], v16, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91]  "
+                     "\n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[112:115],  %[v_os_b7], s[12:15], 0 offen  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], "
+                     "v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]  \n"
+                     " buffer_load_dwordx4 acc[116:119],  %[v_os_b7], s[12:15], 0 offen "
+                     "offset:1024  \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], "
+                     "v[188:189], v[88:91]  \n" _UK_MFMA_
+                     " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[120:123],  %[v_os_b7], s[12:15], 0 offen "
+                     "offset:2048  \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], "
+                     "v[244:245], v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95]  "
+                     "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], "
+                     "v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]  \n"
+                     " buffer_load_dwordx4 acc[124:127],  %[v_os_b7], s[12:15], 0 offen "
+                     "offset:3072  \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], "
+                     "v[252:253], v[92:95]  \n" _UK_MFMA_
+                     " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]  \n"
+                     " s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
+                     " %[v_os_o7], v17, s[8:9] \n"
+                     "  s_mov_b64     exec, s[38:39]                           \n"
+                     " s_add_u32 s60, 0x00000100, s80  \n"
+                     " s_cmp_lt_u32 s60, %[s_loop_cnt]  \n"
+                     " s_cselect_b32 s56, s56, 0  \n"
+                     " s_add_u32 s12, s56, s12  \n"
+                     " s_addc_u32 s13, 0, s13  \n"
+                     " s_cmp_ge_u32 s80, 0x00000100  \n"
+                     " s_cselect_b32 s59, 0x00000100, s59  \n"
+                     " s_add_u32 s8, s59, s8  \n"
+                     " s_addc_u32 s9, 0, s9  \n"
+                     " v_mul_f32 %[c16], %[scale_0], %[c16] \n"
+                     " v_mul_f32 %[c17], %[scale_0], %[c17] \n"
+                     " v_mul_f32 %[c18], %[scale_0], %[c18] \n"
+                     " v_mul_f32 %[c19], %[scale_0], %[c19] \n"
+                     " v_mul_f32 %[c20], %[scale_1], %[c20] \n"
+                     " v_mul_f32 %[c21], %[scale_1], %[c21] \n"
+                     " v_mul_f32 %[c22], %[scale_1], %[c22] \n"
+                     " v_mul_f32 %[c23], %[scale_1], %[c23] \n"
+                     " v_mul_f32 %[c24], %[scale_0], %[c24] \n"
+                     " v_mul_f32 %[c25], %[scale_0], %[c25] \n"
+                     " v_mul_f32 %[c26], %[scale_0], %[c26] \n"
+                     " v_mul_f32 %[c27], %[scale_0], %[c27] \n"
+                     " v_mul_f32 %[c28], %[scale_1], %[c28] \n"
+                     " v_mul_f32 %[c29], %[scale_1], %[c29] \n"
+                     " v_mul_f32 %[c30], %[scale_1], %[c30] \n"
+                     " v_mul_f32 %[c31], %[scale_1], %[c31] \n" _UK_PK_CVT_(
+                         "%[c16]", "%[c17]", "%[c16]") _UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
+                         _UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]") _UK_PK_CVT_(
+                             "%[c22]", "%[c23]", "%[c19]") _UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
+                             _UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
+                                 _UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") _UK_PK_CVT_(
+                                     "%[c30]",
+                                     "%[c31]",
+                                     "%[c23]") " s_addk_i32 s80, 0x0080  \n"
+                                               " s_cmp_lt_i32 s80, %[s_loop_cnt]  \n"
+                                               " s_cbranch_scc0 label_0EC1  \n"
+                                               " s_branch label_0AA6  \n"
+                                               " label_0EC1: \n"
+                                               " s_waitcnt lgkmcnt(0)  \n"
+                                               " s_barrier  \n"
+                                               " ds_read_b32 v10, %[v_sfl_sld] offset:16640  \n"
+                                               " ds_read_b32 v11, %[v_sfl_sld] offset:16672  \n"
+                                               " ds_read_b32 v12, %[v_sfl_sld] offset:16704  \n"
+                                               " ds_read_b32 v13, %[v_sfl_sld] offset:16736  \n"
+                                               " ds_read_b32 v14, %[v_sfl_sld] offset:20992  \n"
+                                               " ds_read_b32 v15, %[v_sfl_sld] offset:21024  \n"
+                                               " ds_read_b32 v16, %[v_sfl_sld] offset:21056  \n"
+                                               " ds_read_b32 v17, %[v_sfl_sld] offset:21088  \n"
+                                               " s_waitcnt lgkmcnt(0)  \n"
+                                               " s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o0], v10, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o1], v11, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o2], v12, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o3], v13, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o4], v14, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o5], v15, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o6], v16, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o7], v17, s[8:9] \n"
+                                               "  s_mov_b64     exec, s[38:39]                     "
+                                               "      \n"
+                                               " s_add_u32 s8, s59, s8  \n"
+                                               " s_addc_u32 s9, 0, s9  \n"
+                                               " ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
+                                               "offset:25344  \n"
+                                               " ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
+                                               "offset:29696  \n"
+                                               " ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
+                                               "offset:27520  \n"
+                                               " ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
+                                               "offset:31872  \n"
+                                               " s_waitcnt lgkmcnt(0)  \n"
+                                               " s_barrier  \n"
+                                               " ds_read_b32 v10, %[v_sfl_sld] offset:25344  \n"
+                                               " ds_read_b32 v11, %[v_sfl_sld] offset:25376  \n"
+                                               " ds_read_b32 v12, %[v_sfl_sld] offset:25408  \n"
+                                               " ds_read_b32 v13, %[v_sfl_sld] offset:25440  \n"
+                                               " ds_read_b32 v14, %[v_sfl_sld] offset:29696  \n"
+                                               " ds_read_b32 v15, %[v_sfl_sld] offset:29728  \n"
+                                               " ds_read_b32 v16, %[v_sfl_sld] offset:29760  \n"
+                                               " ds_read_b32 v17, %[v_sfl_sld] offset:29792  \n"
+                                               " s_waitcnt lgkmcnt(0)  \n"
+                                               " s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o0], v10, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o1], v11, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o2], v12, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o3], v13, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o4], v14, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o5], v15, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o6], v16, s[8:9] \n"
+                                               " s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
+                                               " %[v_os_o7], v17, s[8:9] \n"
+                                               "  s_mov_b64     exec, s[38:39]  \n"
 
-#undef _UK_MFMA_ 
+#undef _UK_MFMA_
 #undef _UK_PK_CVT_
 #undef _UK_ATOMIC_ADD_
-
diff --git a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
index a34a21d39..a6dd83f05 100644
--- a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
@@ -9,508 +9,509 @@
 #endif
 
 "s_mov_b32 s16,    %[s_res_a0] \n"
-"s_mov_b32 s17,    %[s_res_a1] \n"
-"s_mov_b32 s18,    %[s_res_a2] \n"
-"s_mov_b32 s19,    %[s_res_a3] \n"
-"s_mov_b32 s20,    %[s_res_b0] \n"
-"s_mov_b32 s21,    %[s_res_b1] \n"
-"s_mov_b32 s22,    %[s_res_b2] \n"
-"s_mov_b32 s23,    %[s_res_b3] \n"
-// "s_nop  4\n"
-"; -- prefetch A0\n"
-"s_add_u32     m0, 0, %[s_m0_init]                        \n"
-"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
-"s_add_u32 m0, %[smem_sz], %[s_m0_init]                       \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move a with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
-"s_add_u32     s16, s86, s16               ; move a with cond \n"
-"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
-"; -- prefetch A1\n"
-"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
-"s_add_u32 m0, 0, %[s_m0_init]                                \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
-"s_add_u32     s16, s86, s16               ; move a with cond \n"
-"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
-"; -- prefetch B0\n"
-"buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
-"buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen              \n"
-"buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072  \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_b], 0      ; move b with cond \n"
-"s_add_u32     s20, s86, s20               ; move b with cond \n"
-"s_addc_u32    s21, 0, s21                 ; move b with cond \n"
-"s_waitcnt     vmcnt(40)                        \n"
-"s_barrier                                      \n"
-"ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n"    // 1024: N stride, 64 K stride
-"ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
-"ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
-"ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
-"ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
-"ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
-"ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
-"ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
-"L_start%=:                                                         \n"
-"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                             \n"
-"  s_barrier                                                        \n"
-_UK_MFMA_ "  %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[smem_sz], %[s_m0_init]                  \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n"
-"  ds_read_b128  v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]                \n"
-_UK_MFMA_ "  %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n"
-"  ds_read_b128  v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]              \n"
-_UK_MFMA_ "  %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n"
-"  ds_read_b128  v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]              \n"
-_UK_MFMA_ "  %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n"
-"  ds_read_b128  v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]              \n"
-_UK_MFMA_ "  %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n"
-"  ds_read_b128  v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]              \n"
-_UK_MFMA_ "  %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n"
-"  ds_read_b128  v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]              \n"
-_UK_MFMA_ "  %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n"
-"  ds_read_b128  v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]              \n"
-_UK_MFMA_ "  %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n"
-"  ds_read_b128  v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]              \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n"
-_UK_MFMA_ "  %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
-"  s_add_u32     s16, s86, s16                                  \n"
-"  s_addc_u32    s17, 0, s17                                    \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s20, s86, s20                                  \n"
-"  s_addc_u32    s21, 0, s21                                    \n"
-"  ;------------------------------------------                  \n"
-"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                  \n"
-"  s_barrier                                             \n"
-_UK_MFMA_ "  %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, 0, %[s_m0_init]                  \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n"
-"  ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]  \n"
-_UK_MFMA_ "  %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n"
-"  ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]  \n"
-_UK_MFMA_ "  %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n"
-"  ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]                 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n"
-"  ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]                \n"
-_UK_MFMA_ "  %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n"
-"  ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]               \n"
-_UK_MFMA_ "  %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n"
-"  ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]            \n"
-_UK_MFMA_ "  %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n"
-"  ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]                \n"
-_UK_MFMA_ "  %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n"
-"  ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]           \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
-"  s_add_u32     s16, s86, s16                                  \n"
-"  s_addc_u32    s17, 0, s17                                    \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s20, s86, s20                                  \n"
-"  s_addc_u32    s21, 0, s21                                    \n"
-"  s_branch     L_start%=                                       \n"
-"L_end%=:                                                       \n"
-"  s_nop 2                                                      \n"
+    "s_mov_b32 s17,    %[s_res_a1] \n"
+    "s_mov_b32 s18,    %[s_res_a2] \n"
+    "s_mov_b32 s19,    %[s_res_a3] \n"
+    "s_mov_b32 s20,    %[s_res_b0] \n"
+    "s_mov_b32 s21,    %[s_res_b1] \n"
+    "s_mov_b32 s22,    %[s_res_b2] \n"
+    "s_mov_b32 s23,    %[s_res_b3] \n"
+    // "s_nop  4\n"
+    "; -- prefetch A0\n"
+    "s_add_u32     m0, 0, %[s_m0_init]                        \n"
+    "buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+    "s_add_u32 m0, %[smem_sz], %[s_m0_init]                       \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 1             ; move a with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+    "s_add_u32     s16, s86, s16               ; move a with cond \n"
+    "s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+    "; -- prefetch A1\n"
+    "buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+    "s_add_u32 m0, 0, %[s_m0_init]                                \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+    "s_add_u32     s16, s86, s16               ; move a with cond \n"
+    "s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+    "; -- prefetch B0\n"
+    "buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
+    "buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen              \n"
+    "buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072  \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_b], 0      ; move b with cond \n"
+    "s_add_u32     s20, s86, s20               ; move b with cond \n"
+    "s_addc_u32    s21, 0, s21                 ; move b with cond \n"
+    "s_waitcnt     vmcnt(40)                        \n"
+    "s_barrier                                      \n"
+    "ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64
+                                                                               // K stride
+    "ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
+    "ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
+    "ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
+    "ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
+    "ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
+    "ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
+    "ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
+    "L_start%=:                                                         \n"
+    "  s_waitcnt     vmcnt(24) & lgkmcnt(0)                             \n"
+    "  s_barrier                                                        \n" _UK_MFMA_
+    "  %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[smem_sz], %[s_m0_init]                  \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n"
+    "  ds_read_b128  v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]                "
+    "\n" _UK_MFMA_ "  %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n"
+    "  ds_read_b128  v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]              "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n"
+    "  ds_read_b128  v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]              "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n"
+    "  ds_read_b128  v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]              "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n"
+    "  ds_read_b128  v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]              "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n"
+    "  ds_read_b128  v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]              "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n"
+    "  ds_read_b128  v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]              "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n"
+    "  ds_read_b128  v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]              \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n" _UK_MFMA_
+    "  %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n"
+    "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+    "  s_cbranch_scc0 L_end%=                                       \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+    "  s_add_u32     s16, s86, s16                                  \n"
+    "  s_addc_u32    s17, 0, s17                                    \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+    "  s_add_u32     s20, s86, s20                                  \n"
+    "  s_addc_u32    s21, 0, s21                                    \n"
+    "  ;------------------------------------------                  \n"
+    "  s_waitcnt     vmcnt(24) & lgkmcnt(0)                  \n"
+    "  s_barrier                                             \n" _UK_MFMA_
+    "  %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, 0, %[s_m0_init]                  \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n"
+    "  ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]  \n" _UK_MFMA_
+    "  %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n"
+    "  ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]  \n" _UK_MFMA_
+    "  %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n"
+    "  ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]                 "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n"
+    "  ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]                "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n"
+    "  ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]               "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n"
+    "  ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]            "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n"
+    "  ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]                "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n"
+    "  ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]           \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n"
+    "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+    "  s_cbranch_scc0 L_end%=                                       \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+    "  s_add_u32     s16, s86, s16                                  \n"
+    "  s_addc_u32    s17, 0, s17                                    \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+    "  s_add_u32     s20, s86, s20                                  \n"
+    "  s_addc_u32    s21, 0, s21                                    \n"
+    "  s_branch     L_start%=                                       \n"
+    "L_end%=:                                                       \n"
+    "  s_nop 2                                                      \n"
 
 #undef _UK_MFMA_
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index f6e1f12e2..30f0da212 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -144,6 +144,7 @@ bool profile_gemm_universal_impl(int do_verification,
     }
 
     std::string best_op_name;
+    std::optional<std::string> best_op_object_name;
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
@@ -225,7 +226,8 @@ bool profile_gemm_universal_impl(int do_verification,
                     }
                 }
 
-                std::string op_name = op_ptr->GetTypeString();
+                std::string op_name                    = op_ptr->GetTypeString();
+                std::optional<std::string> op_obj_name = op_ptr->GetObjectName();
 
                 float ave_time = invoker_ptr->Run(argument_ptr.get(),
                                                   StreamConfig{nullptr,
@@ -251,11 +253,12 @@ bool profile_gemm_universal_impl(int do_verification,
 
                 if(tflops > best_tflops && ave_time > 1e-10)
                 {
-                    best_op_name    = op_name;
-                    best_tflops     = tflops;
-                    best_ave_time   = ave_time;
-                    best_gb_per_sec = gb_per_sec;
-                    best_kbatch     = kbatch_curr;
+                    best_op_name        = op_name;
+                    best_op_object_name = op_obj_name;
+                    best_tflops         = tflops;
+                    best_ave_time       = ave_time;
+                    best_gb_per_sec     = gb_per_sec;
+                    best_kbatch         = kbatch_curr;
                 }
             }
             else
@@ -306,6 +309,9 @@ bool profile_gemm_universal_impl(int do_verification,
               << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
               << " GB/s, " << best_op_name << std::endl;
 
+    if(best_op_object_name)
+        std::cout << best_op_object_name.value() << std::endl;
+
     return pass;
 }
 
-- 
GitLab


From 627a27bda3f38b3d904f844ec0b4d988e50cc262 Mon Sep 17 00:00:00 2001
From: jakpiase <jakub.piasecki@amd.com>
Date: Tue, 17 Dec 2024 14:25:22 +0100
Subject: [PATCH 127/153] Added unit tests for CK Tile compute bound gemm
 pipeline (#1728)

---
 test/ck_tile/gemm/CMakeLists.txt              |  2 +-
 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp  | 36 -----------
 test/ck_tile/gemm/test_gemm_pipeline.cpp      | 42 +++++++++++++
 ...es.inc => test_gemm_pipeline_ut_cases.inc} | 10 +--
 ...e_util.hpp => test_gemm_pipeline_util.hpp} | 62 +++++++++++++------
 5 files changed, 90 insertions(+), 62 deletions(-)
 delete mode 100644 test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
 create mode 100644 test/ck_tile/gemm/test_gemm_pipeline.cpp
 rename test/ck_tile/gemm/{test_gemm_mem_pipeline_ut_cases.inc => test_gemm_pipeline_ut_cases.inc} (79%)
 rename test/ck_tile/gemm/{test_gemm_mem_pipeline_util.hpp => test_gemm_pipeline_util.hpp} (80%)

diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index f96ad9c6e..ecfbd4e55 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
 # Currently ck_tile is only built on gfx9
 if(GPU_TARGETS MATCHES "gfx9")
-    add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline test_gemm_pipeline.cpp)
 endif()
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
deleted file mode 100644
index aeb383c87..000000000
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <tuple>
-
-#include "gtest/gtest.h"
-
-#include "ck_tile/host.hpp"
-#include "test_gemm_mem_pipeline_util.hpp"
-
-using F16       = ck_tile::half_t;
-using F32       = float;
-using Row       = ck_tile::tensor_layout::gemm::RowMajor;
-using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
-using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
-                                             ck_tile::GemmPipelineScheduler::Intrawave>;
-using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
-                                             ck_tile::GemmPipelineScheduler::Interwave>;
-
-// clang-format off
-using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>
-    >;
-// clang-format on
-
-TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
-
-#include "test_gemm_mem_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline.cpp b/test/ck_tile/gemm/test_gemm_pipeline.cpp
new file mode 100644
index 000000000..48a2b86a6
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline.cpp
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_gemm_pipeline_util.hpp"
+
+using F16       = ck_tile::half_t;
+using F32       = float;
+using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
+                                             ck_tile::GemmPipelineScheduler::Intrawave>;
+using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
+                                             ck_tile::GemmPipelineScheduler::Interwave>;
+using Mem       = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
+using Comp      = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Comp>;
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileGemmPipeline, KernelTypes);
+
+#include "test_gemm_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
similarity index 79%
rename from test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
rename to test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
index af94d68f2..c78d69601 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
@@ -3,7 +3,7 @@
 
 #pragma once
 
-TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
+TYPED_TEST(TestCkTileGemmPipeline, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 1024;
@@ -13,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
+TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 1024;
@@ -23,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
+TYPED_TEST(TestCkTileGemmPipeline, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 1024;
@@ -33,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
+TYPED_TEST(TestCkTileGemmPipeline, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 1024;
@@ -43,7 +43,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument)
+TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument)
 {
     constexpr int M = 512;
     constexpr int N = 1025;
diff --git a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
similarity index 80%
rename from test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
rename to test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index 6941a7596..a51498602 100644
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -11,18 +11,24 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
+enum struct GemmPipelineType
+{
+    Mem,
+    Comp
+};
 template <typename Tuple>
-class TestCkTileGemmMemPipeline : public ::testing::Test
+class TestCkTileGemmPipeline : public ::testing::Test
 {
     protected:
-    using ALayout                   = std::tuple_element_t<0, Tuple>;
-    using BLayout                   = std::tuple_element_t<1, Tuple>;
-    using CLayout                   = std::tuple_element_t<2, Tuple>;
-    using ADataType                 = std::tuple_element_t<3, Tuple>;
-    using BDataType                 = std::tuple_element_t<4, Tuple>;
-    using AccDataType               = std::tuple_element_t<5, Tuple>;
-    using CDataType                 = std::tuple_element_t<6, Tuple>;
-    static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value;
+    using ALayout                      = std::tuple_element_t<0, Tuple>;
+    using BLayout                      = std::tuple_element_t<1, Tuple>;
+    using CLayout                      = std::tuple_element_t<2, Tuple>;
+    using ADataType                    = std::tuple_element_t<3, Tuple>;
+    using BDataType                    = std::tuple_element_t<4, Tuple>;
+    using AccDataType                  = std::tuple_element_t<5, Tuple>;
+    using CDataType                    = std::tuple_element_t<6, Tuple>;
+    static constexpr auto Scheduler    = std::tuple_element_t<7, Tuple>::value;
+    static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value;
     // TODO: expose tile size through test t-param ?
 
     struct gemm_args
@@ -74,8 +80,13 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
 
         using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
 
-        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
-            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
+        using BaseGemmPipeline = std::conditional_t<
+            PipelineType == GemmPipelineType::Mem,
+            ck_tile::BaseGemmPipelineAgBgCrMem<
+                ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>,
+            ck_tile::BaseGemmPipelineAgBgCrCompV3<
+                ck_tile::
+                    GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>>;
 
         const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
         const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
@@ -85,15 +96,26 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
             constexpr bool has_hot_loop_v = has_hot_loop_.value;
             constexpr auto tail_number_v  = tail_number_.value;
 
-            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
-                ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                      BDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      Traits,
-                                                      Scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v>>;
+            using GemmPipeline =
+                std::conditional_t<PipelineType == GemmPipelineType::Mem,
+                                   ck_tile::GemmPipelineAgBgCrMem<
+                                       ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                             BDataType,
+                                                                             AccDataType,
+                                                                             GemmShape,
+                                                                             Traits,
+                                                                             Scheduler,
+                                                                             has_hot_loop_v,
+                                                                             tail_number_v>>,
+                                   ck_tile::GemmPipelineAgBgCrCompV3<
+                                       ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                             BDataType,
+                                                                             AccDataType,
+                                                                             GemmShape,
+                                                                             Traits,
+                                                                             Scheduler,
+                                                                             has_hot_loop_v,
+                                                                             tail_number_v>>>;
             using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKargs(args.p_a,
                                            args.p_b,
-- 
GitLab


From 0e54d7ae5a638c9c1cbdc478dd12159354cd7e97 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 17 Dec 2024 06:57:55 -0800
Subject: [PATCH 128/153] Bump rocm-docs-core from 1.11.0 to 1.12.0 in
 /docs/sphinx (#1753)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.11.0 to 1.12.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.11.0...v1.12.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index d1b3465b9..46a61a87f 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.11.0
+rocm-docs-core==1.12.0
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 26d0aa244..c2e74baae 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.11.0
+rocm-docs-core==1.12.0
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 6ef8d3c295686b872d7e7a86621b68f765d98572 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 12 Dec 2024 19:47:57 +0000
Subject: [PATCH 129/153] refactor conditional usage; fix build on rocm6.1
 where the reference didn't exist

---
 include/ck/utility/amd_ck_fp8.hpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp
index 7b21ad646..1bdb1d078 100644
--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -18,6 +18,12 @@
 #define CK_USE_OCP_FP8 0
 #endif
 
+namespace {
+// https://en.cppreference.com/w/cpp/types/conditional
+template <bool B, class T, class F> struct conditional { using type = T; };
+template <class T, class F> struct conditional<false, T, F> { using type = F; };
+}
+
 namespace ck {
 
 using f8_fnuz_t  = _BitInt(8);
@@ -191,10 +197,10 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
         }
     }
 
-    typename __hip_internal::conditional<
+    typename conditional<
         sizeof(T) == 2,
         unsigned short int,
-        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
             type>::type retval;
 
     if constexpr(we == 5 && is_half && !is_fnuz)
@@ -538,10 +544,10 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn
 
     constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10);
 
-    using T_bitwise = typename __hip_internal::conditional<
+    using T_bitwise = typename conditional<
         sizeof(T) == 2,
         unsigned short int,
-        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
             type>::type;
     T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
 
-- 
GitLab


From 689a5ae45be802f51fc947a9f92208dcfb143f77 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Tue, 17 Dec 2024 10:17:29 -0800
Subject: [PATCH 130/153] Pass build flags to config.h (#1760)

* pass the build flags to config.h

* fix clang format
---
 CMakeLists.txt                    |  4 ++++
 include/ck/config.h.in            | 16 ++++++++++++++++
 include/ck/utility/amd_ck_fp8.hpp | 20 +++++++++++++-------
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c8698756..be4efd3df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,14 +183,17 @@ message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     message("Enabling XDL instances")
     add_definitions(-DCK_USE_XDL)
+    set(CK_USE_XDL "ON")
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
     message("Enabling FP8 gemms on native architectures")
     add_definitions(-DCK_USE_GFX94)
+    set(CK_USE_GFX94 "ON")
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     message("Enabling WMMA instances")
     add_definitions(-DCK_USE_WMMA)
+    set(CK_USE_WMMA "ON")
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     add_definitions(-DCK_USE_OCP_FP8)
@@ -204,6 +207,7 @@ endif()
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
     add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+    set(CK_USE_FP8_ON_UNSUPPORTED_ARCH "ON")
 endif()
 
 # CK config file to record supported datatypes, etc.
diff --git a/include/ck/config.h.in b/include/ck/config.h.in
index 0f0b7bd60..55a498073 100644
--- a/include/ck/config.h.in
+++ b/include/ck/config.h.in
@@ -111,6 +111,22 @@
 #cmakedefine CK_USE_WMMA @CK_USE_WMMA@
 #endif
 
+#ifndef CK_USE_GFX94
+#cmakedefine CK_USE_GFX94 @CK_USE_GFX94@
+#endif
+
+#ifndef DCK_USE_OCP_FP8
+#cmakedefine DCK_USE_OCP_FP8 @DCK_USE_OCP_FP8@
+#endif
+
+#ifndef CK_USE_FNUZ_FP8
+#cmakedefine CK_USE_FNUZ_FP8 @CK_USE_FNUZ_FP8@
+#endif
+
+#ifndef CK_USE_FP8_ON_UNSUPPORTED_ARCH
+#cmakedefine CK_USE_FP8_ON_UNSUPPORTED_ARCH @CK_USE_FP8_ON_UNSUPPORTED_ARCH@
+#endif
+
 // clang-format on
 
 #endif // CK_CONFIG_H_IN
diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp
index 1bdb1d078..e9174904c 100644
--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -20,9 +20,17 @@
 
 namespace {
 // https://en.cppreference.com/w/cpp/types/conditional
-template <bool B, class T, class F> struct conditional { using type = T; };
-template <class T, class F> struct conditional<false, T, F> { using type = F; };
-}
+template <bool B, class T, class F>
+struct conditional
+{
+    using type = T;
+};
+template <class T, class F>
+struct conditional<false, T, F>
+{
+    using type = F;
+};
+} // namespace
 
 namespace ck {
 
@@ -200,8 +208,7 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
     typename conditional<
         sizeof(T) == 2,
         unsigned short int,
-        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
-            type>::type retval;
+        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::type>::type retval;
 
     if constexpr(we == 5 && is_half && !is_fnuz)
     {
@@ -547,8 +554,7 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn
     using T_bitwise = typename conditional<
         sizeof(T) == 2,
         unsigned short int,
-        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
-            type>::type;
+        typename conditional<sizeof(T) == 4, unsigned int, unsigned long long>::type>::type;
     T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
 
     unsigned long long x{x_bitwise};
-- 
GitLab


From d9e37c6874402023f5fe033f6821bde6869c5da5 Mon Sep 17 00:00:00 2001
From: Harisankar Sadasivan <135730918+hsadasiv@users.noreply.github.com>
Date: Tue, 17 Dec 2024 10:31:21 -0800
Subject: [PATCH 131/153] updated fp16 instances to be on parity with universal
 gemm instances (#1754)

* updated fp16 instances to be on parity with universal gemm instances

* corrected instance name to streamk instance
---
 ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp | 18 ++++++++++--
 ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp | 29 +++++++++++++++----
 2 files changed, 39 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
 mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp

diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
old mode 100644
new mode 100755
index 6e8d5c798..5460f7f85
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
@@ -41,6 +41,8 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -49,7 +51,9 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
@@ -61,14 +65,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
 
-        // Latency friendly
+       // Latency friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   4,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -82,6 +93,7 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
old mode 100644
new mode 100755
index e00c1733e..e716b3e85
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
@@ -42,14 +42,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
         
         // Compute friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   4,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   2,   2,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // AGPR Spill
-        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+     
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -68,15 +75,23 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
 
-        // Latency friendly 
+       // Latency friendly 
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // Memory friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   4,   4,  32,   32,    2,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   4,   4,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -84,12 +99,16 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   4,   4,  32,   32,    1,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   2,   2,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
-- 
GitLab


From f6c4d614e35b7424774160a23d8e8bef3b15faad Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Wed, 18 Dec 2024 09:45:58 +0100
Subject: [PATCH 132/153] [CK_TILE] Move hipmalloc/memcpy calls out of gpu
 reference gemm (#1743)

* [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm

* [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm - review changes

* [CK_TILE] Move hipmalloc/memcpy calls out of gpu reference gemm - review fix
---
 example/ck_tile/03_gemm/run_gemm_example.inc  |  29 +++-
 .../run_batched_gemm_example.inc              |  33 +++-
 .../ck_tile/host/reference/reference_gemm.hpp | 162 ++----------------
 3 files changed, 68 insertions(+), 156 deletions(-)

diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index a1fc15577..2b7a967ba 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -161,14 +161,39 @@ int run_gemm_example_with_layouts(int argc,
         c_m_n_gpu_ref.SetZero();
         c_m_n_gpu_buf_ref.SetZero();
 
+        ADataType* d_A;
+        BDataType* d_B;
+        CDataType* d_C;
+
+        ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType)));
+
+        ck_tile::hip_check_error(hipMemcpy(d_A,
+                                           a_m_k_dev_buf.GetDeviceBuffer(),
+                                           M * K * sizeof(ADataType),
+                                           hipMemcpyHostToDevice));
+        ck_tile::hip_check_error(hipMemcpy(d_B,
+                                           b_k_n_dev_buf.GetDeviceBuffer(),
+                                           N * K * sizeof(BDataType),
+                                           hipMemcpyHostToDevice));
+
         ck_tile::reference_gemm_gpu<ADataType,
                                     BDataType,
                                     AccDataType,
                                     CDataType,
                                     ALayout,
                                     BLayout,
-                                    CLayout>(
-            a_m_k_dev_buf, b_k_n_dev_buf, c_m_n_gpu_buf_ref, M, N, K, stride_A, stride_B, stride_C);
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
+                                           d_C,
+                                           M * N * sizeof(CDataType),
+                                           hipMemcpyDeviceToHost));
+
+        ck_tile::hip_check_error(hipFree(d_A));
+        ck_tile::hip_check_error(hipFree(d_B));
+        ck_tile::hip_check_error(hipFree(d_C));
 
         c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
         pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
index dacca2042..8345eef95 100644
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -188,15 +188,33 @@ int run_batched_gemm_example_with_layouts(int argc,
         c_m_n_gpu_ref.SetZero();
         c_m_n_gpu_buf_ref.SetZero();
 
+        ADataType* d_A;
+        BDataType* d_B;
+        CDataType* d_C;
+
+        ck_tile::hip_check_error(hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)));
+
+        ck_tile::hip_check_error(hipMemcpy(d_A,
+                                           a_m_k_dev_buf.GetDeviceBuffer(),
+                                           batch_count * M * K * sizeof(ADataType),
+                                           hipMemcpyHostToDevice));
+
+        ck_tile::hip_check_error(hipMemcpy(d_B,
+                                           b_k_n_dev_buf.GetDeviceBuffer(),
+                                           batch_count * N * K * sizeof(BDataType),
+                                           hipMemcpyHostToDevice));
+
         ck_tile::reference_batched_gemm_gpu<ADataType,
                                             BDataType,
                                             AccDataType,
                                             CDataType,
                                             ALayout,
                                             BLayout,
-                                            CLayout>(a_m_k_dev_buf,
-                                                     b_k_n_dev_buf,
-                                                     c_m_n_gpu_buf_ref,
+                                            CLayout>(d_A,
+                                                     d_B,
+                                                     d_C,
                                                      M,
                                                      N,
                                                      K,
@@ -208,6 +226,15 @@ int run_batched_gemm_example_with_layouts(int argc,
                                                      batch_stride_C,
                                                      batch_count);
 
+        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
+                                           d_C,
+                                           batch_count * M * N * sizeof(CDataType),
+                                           hipMemcpyDeviceToHost));
+
+        ck_tile::hip_check_error(hipFree(d_A));
+        ck_tile::hip_check_error(hipFree(d_B));
+        ck_tile::hip_check_error(hipFree(d_C));
+
         c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
         pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref);
 
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index 8bd1f5b04..fc412e883 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -97,9 +97,9 @@ template <typename ADataType,
           typename LayoutA,
           typename LayoutB,
           typename LayoutC>
-void reference_gemm_gpu(DeviceMem& a_device,
-                        DeviceMem& b_device,
-                        DeviceMem& c_device,
+void reference_gemm_gpu(ADataType* a_ptr,
+                        BDataType* b_ptr,
+                        CDataType* c_ptr,
                         index_t M,
                         index_t N,
                         index_t K,
@@ -107,79 +107,13 @@ void reference_gemm_gpu(DeviceMem& a_device,
                         index_t stride_b,
                         index_t stride_c)
 {
-
-    ADataType* d_A;
-    BDataType* d_B;
-    CDataType* d_C;
-
-    hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType));
-    hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType));
-    hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType));
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    errA = hipMemcpy(
-        d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice);
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
-    }
-
-    errB = hipMemcpy(
-        d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice);
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
-    }
-
     int totalElements      = M * N;
     int numThreadsPerBlock = 256; // Common choice for threads per block
     int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
 
     naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
-        <<<numBlocks, numThreadsPerBlock>>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c);
-    errC = hipMemcpy(
-        c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
-    }
-
-    errA = hipFree(d_A);
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
-    }
-
-    errB = hipFree(d_B);
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
-    }
-
-    errC = hipFree(d_C);
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
-    }
+        <<<numBlocks, numThreadsPerBlock>>>(
+            a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c);
 
     return;
 }
@@ -191,9 +125,9 @@ template <typename ADataType,
           typename LayoutA,
           typename LayoutB,
           typename LayoutC>
-void reference_batched_gemm_gpu(DeviceMem& a_device,
-                                DeviceMem& b_device,
-                                DeviceMem& c_device,
+void reference_batched_gemm_gpu(ADataType* a_ptr,
+                                BDataType* b_ptr,
+                                CDataType* c_ptr,
                                 index_t M,
                                 index_t N,
                                 index_t K,
@@ -205,94 +139,20 @@ void reference_batched_gemm_gpu(DeviceMem& a_device,
                                 index_t batch_stride_C,
                                 index_t batch_count)
 {
-
-    ADataType* d_A;
-    BDataType* d_B;
-    CDataType* d_C;
-
-    hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType));
-    hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType));
-    hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType));
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
-                  << std::endl;
-        return; // Early exit on error
-    }
-
-    errA = hipMemcpy(d_A,
-                     a_device.GetDeviceBuffer(),
-                     batch_count * M * K * sizeof(ADataType),
-                     hipMemcpyHostToDevice);
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
-    }
-
-    errB = hipMemcpy(d_B,
-                     b_device.GetDeviceBuffer(),
-                     batch_count * N * K * sizeof(BDataType),
-                     hipMemcpyHostToDevice);
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
-    }
-
     int totalElements      = M * N;
     int numThreadsPerBlock = 256; // Common choice for threads per block
     int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
 
     for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
     {
-        ADataType* d_ATemp = d_A + batch_id * batch_stride_A;
-        BDataType* d_BTemp = d_B + batch_id * batch_stride_B;
-        CDataType* d_CTemp = d_C + batch_id * batch_stride_C;
+        ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A;
+        BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B;
+        CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C;
         naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
             <<<numBlocks, numThreadsPerBlock>>>(
                 d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
     }
 
-    errC = hipMemcpy(c_device.GetDeviceBuffer(),
-                     d_C,
-                     batch_count * M * N * sizeof(CDataType),
-                     hipMemcpyDeviceToHost);
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
-    }
-
-    errA = hipFree(d_A);
-    if(errA != hipSuccess)
-    {
-        std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
-    }
-
-    errB = hipFree(d_B);
-    if(errB != hipSuccess)
-    {
-        std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
-    }
-
-    errC = hipFree(d_C);
-    if(errC != hipSuccess)
-    {
-        std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
-    }
-
     return;
 }
 } // namespace ck_tile
-- 
GitLab


From 1c1b336371e2367fece6b33644b36ab30d92b2d3 Mon Sep 17 00:00:00 2001
From: Xiaodong Wang <xw285@cornell.edu>
Date: Wed, 18 Dec 2024 02:32:38 -0800
Subject: [PATCH 133/153] Disambiguate bit_cast (#1749)

Adding namespace to disambiguate with std::bit_cast

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 include/ck_tile/core/container/meta_data_buffer.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ck_tile/core/container/meta_data_buffer.hpp b/include/ck_tile/core/container/meta_data_buffer.hpp
index 7493b93d8..eba60fac7 100644
--- a/include/ck_tile/core/container/meta_data_buffer.hpp
+++ b/include/ck_tile/core/container/meta_data_buffer.hpp
@@ -30,7 +30,7 @@ struct meta_data_buffer
         {
             constexpr index_t size = sizeof(T);
 
-            auto tmp = bit_cast<array<std::byte, size>>(data);
+            auto tmp = ck_tile::bit_cast<array<std::byte, size>>(data);
 
             for(int i = 0; i < size; i++)
             {
@@ -66,7 +66,7 @@ struct meta_data_buffer
                 pos++;
             }
 
-            data = bit_cast<T>(tmp);
+            data = ck_tile::bit_cast<T>(tmp);
         }
 
         return data;
@@ -86,7 +86,7 @@ struct meta_data_buffer
             pos++;
         }
 
-        auto data = bit_cast<T>(tmp);
+        auto data = ck_tile::bit_cast<T>(tmp);
 
         return data;
     }
-- 
GitLab


From 453ca373479e1c3510bff66c03a773a29f1caada Mon Sep 17 00:00:00 2001
From: aledudek <aleksander.dudek@amd.com>
Date: Wed, 18 Dec 2024 17:52:46 +0100
Subject: [PATCH 134/153] [CK TILE] Refactor GemmKernel to be reused by other
 GEMM related operators (#1730)

* Gemm Kernel Refactor part1

* Gemm Kernel Refactor common gemm pipeline part2

* [CK TILE] Refactor batched gemm to reuse GemmKernel

* [CK TILE] Refactor GemmKernel - review changes part1

* [CK TILE] Refactor GemmKernel - references fix

* [CK TILE] Refactor GemmKernel - naming changes, add problem

* [CK_TILE] Refactor GemmKernel - update tests

* [CK_TILE] Refactor GemmKernel - review changes

* [CK_TILE] Refactor GemmKernel - update test

* [CK_TILE] Refactor GemmKernel - constness fixes

* [CK_TILE] Refactor GemmKernel - update tests
---
 example/ck_tile/03_gemm/gemm_basic.cpp        |  16 +-
 example/ck_tile/03_gemm/gemm_basic.hpp        |  16 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |  10 +-
 .../ck_tile/16_batched_gemm/batched_gemm.cpp  |   6 +-
 .../ck_tile/16_batched_gemm/batched_gemm.hpp  |   6 +-
 .../run_batched_gemm_example.inc              |   2 +-
 .../ops/gemm/kernel/batched_gemm_kernel.hpp   | 274 +++++-------------
 .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp   | 259 ++++++++++++-----
 .../batched_gemm/test_batched_gemm_util.hpp   |  42 ++-
 test/ck_tile/gemm/test_gemm_pipeline_util.hpp |  40 +--
 10 files changed, 300 insertions(+), 371 deletions(-)

diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index f5260c306..4c630375f 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -15,7 +15,7 @@
 #include "gemm_basic.hpp"
 
 template <typename ALayout, typename BLayout, typename CLayout>
-float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
+float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
 {
     // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
     constexpr bool kPadM = false;
@@ -79,17 +79,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
     using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
 
-    auto kargs = Kernel::MakeKargs(args.p_a,
-                                   args.p_b,
-                                   args.p_c,
-                                   args.M,
-                                   args.N,
-                                   args.K,
-                                   args.stride_A,
-                                   args.stride_B,
-                                   args.stride_C);
-
-    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
+    auto kargs = Kernel::MakeKernelArgs(args);
+
+    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
     constexpr dim3 blocks = Kernel::BlockSize();
 
     if(!Kernel::IsSupportedArgument(kargs))
diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp
index 23e99bc2a..58cdaea7d 100644
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -51,20 +51,6 @@ using BDataType   = Types::BDataType;
 using AccDataType = Types::AccDataType;
 using CDataType   = Types::CDataType;
 
-struct gemm_basic_args
-{
-    const void* p_a;
-    const void* p_b;
-    void* p_c;
-    ck_tile::index_t kbatch;
-    ck_tile::index_t M;
-    ck_tile::index_t N;
-    ck_tile::index_t K;
-    ck_tile::index_t stride_A;
-    ck_tile::index_t stride_B;
-    ck_tile::index_t stride_C;
-};
-
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
@@ -89,4 +75,4 @@ auto create_args(int argc, char* argv[])
 }
 
 // host API
-float gemm_calc(gemm_basic_args args, const ck_tile::stream_config& s);
+float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 2b7a967ba..68df389bf 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -16,11 +16,11 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                   int n_warmup,
                   int n_repeat)
 {
-    gemm_basic_args args;
-    args.p_a      = a_m_k_dev_buf.GetDeviceBuffer();
-    args.p_b      = b_k_n_dev_buf.GetDeviceBuffer();
-    args.p_c      = c_m_n_dev_buf.GetDeviceBuffer();
-    args.kbatch   = kbatch;
+    ck_tile::GemmHostArgs args;
+    args.a_ptr    = a_m_k_dev_buf.GetDeviceBuffer();
+    args.b_ptr    = b_k_n_dev_buf.GetDeviceBuffer();
+    args.c_ptr    = c_m_n_dev_buf.GetDeviceBuffer();
+    args.k_batch  = kbatch;
     args.M        = M;
     args.N        = N;
     args.K        = K;
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
index bfdd74126..9b4ed9a9e 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -16,7 +16,7 @@
 #include "batched_gemm.hpp"
 
 template <typename ALayout, typename BLayout, typename CLayout>
-float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
+float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s)
 {
     // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
     constexpr bool kPadM        = false;
@@ -79,9 +79,9 @@ float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config&
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
     using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
 
-    auto kargs = Kernel::MakeKargs(args);
+    auto kargs = Kernel::MakeKernelArgs(args);
 
-    const dim3 grids      = Kernel::GridSize(args);
+    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.batch_count);
     constexpr dim3 blocks = Kernel::BlockSize();
 
     if(s.log_level_ > 0)
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
index e252c0f67..f0c0c9efb 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -29,10 +29,6 @@ using BDataType   = Types::BDataType;
 using AccDataType = Types::AccDataType;
 using CDataType   = Types::CDataType;
 
-struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs
-{
-};
-
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
@@ -60,4 +56,4 @@ auto create_args(int argc, char* argv[])
 }
 
 // host API
-float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s);
+float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
index 8345eef95..4e7218b5b 100644
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -20,7 +20,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                           int n_warmup,
                           int n_repeat)
 {
-    batched_gemm_kargs args;
+    ck_tile::BatchedGemmHostArgs args;
     args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
     args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
     args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index 07b4af573..07a4cf8fb 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -3,90 +3,93 @@
 
 #pragma once
 
-#include <iostream>
-#include <string>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 
 namespace ck_tile {
 
-struct BatchedGemmHostArgs
+struct BatchedGemmHostArgs : public ck_tile::GemmHostArgs
 {
-    const void* a_ptr;
-    const void* b_ptr;
-    void* c_ptr;
-    index_t M;
-    index_t N;
-    index_t K;
-    index_t stride_A;
-    index_t stride_B;
-    index_t stride_C;
-    index_t batch_stride_A;
-    index_t batch_stride_B;
-    index_t batch_stride_C;
-    index_t batch_count;
+    CK_TILE_HOST BatchedGemmHostArgs() = default;
+    CK_TILE_HOST BatchedGemmHostArgs(const void* a_ptr_,
+                                     const void* b_ptr_,
+                                     void* c_ptr_,
+                                     ck_tile::index_t k_batch_,
+                                     ck_tile::index_t M_,
+                                     ck_tile::index_t N_,
+                                     ck_tile::index_t K_,
+                                     ck_tile::index_t stride_A_,
+                                     ck_tile::index_t stride_B_,
+                                     ck_tile::index_t stride_C_,
+                                     ck_tile::index_t batch_stride_A_,
+                                     ck_tile::index_t batch_stride_B_,
+                                     ck_tile::index_t batch_stride_C_,
+                                     ck_tile::index_t batch_count_)
+        : GemmHostArgs(
+              a_ptr_, b_ptr_, c_ptr_, k_batch_, M_, N_, K_, stride_A_, stride_B_, stride_C_),
+          batch_stride_A(batch_stride_A_),
+          batch_stride_B(batch_stride_B_),
+          batch_stride_C(batch_stride_C_),
+          batch_count(batch_count_)
+    {
+    }
+
+    ck_tile::index_t batch_stride_A;
+    ck_tile::index_t batch_stride_B;
+    ck_tile::index_t batch_stride_C;
+    ck_tile::index_t batch_count;
 };
 
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
-struct BatchedGemmKernel
+struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>
 {
-    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
-    using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
-    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
-    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
-    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
-    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    using Base = GemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
 
-    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
-    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
-    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+    using GemmKernelArgs = typename Base::GemmKernelArgs;
 
-    struct BatchedGemmKargs
+    using ADataType = typename Base::ADataType;
+    using BDataType = typename Base::BDataType;
+    using CDataType = typename Base::CDataType;
+
+    using TilePartitioner  = typename Base::TilePartitioner;
+    using GemmPipeline     = typename Base::GemmPipeline;
+    using EpiloguePipeline = typename Base::EpiloguePipeline;
+    using ALayout          = typename Base::ALayout;
+    using BLayout          = typename Base::BLayout;
+    using CLayout          = typename Base::CLayout;
+
+    struct BatchedGemmKernelArgs : GemmKernelArgs
     {
-        const void* a_ptr;
-        const void* b_ptr;
-        void* c_ptr;
-        index_t M;
-        index_t N;
-        index_t K;
-        index_t stride_A;
-        index_t stride_B;
-        index_t stride_C;
         index_t batch_stride_A;
         index_t batch_stride_B;
         index_t batch_stride_C;
         index_t batch_count;
     };
 
-    using Kargs = BatchedGemmKargs;
-    using Hargs = BatchedGemmHostArgs;
+    using KernelArgs = BatchedGemmKernelArgs;
 
-    __host__ static constexpr auto GridSize(const Hargs& h)
+    __host__ static constexpr auto GridSize(index_t M, index_t N, index_t batch_count)
     {
-        return TilePartitioner::GridSize(h.M, h.N, h.batch_count);
+        return TilePartitioner::GridSize(M, N, batch_count);
     }
 
-    __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    __host__ static constexpr auto BlockSize() { return dim3(Base::KernelBlockSize); }
 
-    CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h)
+    CK_TILE_HOST static constexpr BatchedGemmKernelArgs
+    MakeKernelArgs(const BatchedGemmHostArgs& hostArgs)
     {
-        Kargs k;
-        k.a_ptr          = h.a_ptr;
-        k.b_ptr          = h.b_ptr;
-        k.c_ptr          = h.c_ptr;
-        k.M              = h.M;
-        k.N              = h.N;
-        k.K              = h.K;
-        k.stride_A       = h.stride_A;
-        k.stride_B       = h.stride_B;
-        k.stride_C       = h.stride_C;
-        k.batch_stride_A = h.batch_stride_A;
-        k.batch_stride_B = h.batch_stride_B;
-        k.batch_stride_C = h.batch_stride_C;
-        k.batch_count    = h.batch_count;
-        return k;
+        return BatchedGemmKernelArgs{{hostArgs.a_ptr,
+                                      hostArgs.b_ptr,
+                                      hostArgs.c_ptr,
+                                      hostArgs.M,
+                                      hostArgs.N,
+                                      hostArgs.K,
+                                      hostArgs.stride_A,
+                                      hostArgs.stride_B,
+                                      hostArgs.stride_C},
+                                     hostArgs.batch_stride_A,
+                                     hostArgs.batch_stride_B,
+                                     hostArgs.batch_stride_C,
+                                     hostArgs.batch_count};
     }
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
@@ -94,7 +97,7 @@ struct BatchedGemmKernel
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
-    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
     {
         const auto [i_m, i_n] = TilePartitioner{}();
         const auto i_batch    = __builtin_amdgcn_readfirstlane(blockIdx.z);
@@ -102,156 +105,17 @@ struct BatchedGemmKernel
         //  options
         const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
         const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
-        const ADataType* a_start  = static_cast<const ADataType*>(kargs.a_ptr);
+        const ADataType* a_ptr    = static_cast<const ADataType*>(kargs.a_ptr) + batch_offset_A;
 
         const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
         const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
-        const BDataType* b_start  = static_cast<const BDataType*>(kargs.b_ptr);
-
-        // Convert pointers to tensor views
-        auto a_tensor_view = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_start + batch_offset_A,
-                    make_tuple(kargs.M, kargs.K),
-                    make_tuple(kargs.stride_A, 1),
-                    number<GemmPipeline::VectorSizeA>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    a_start + batch_offset_A,
-                    make_tuple(kargs.M, kargs.K),
-                    make_tuple(1, kargs.stride_A),
-                    number<1>{},
-                    number<1>{});
-            }
-        }();
-
-        auto b_tensor_view = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    b_start + batch_offset_B,
-                    make_tuple(kargs.N, kargs.K),
-                    make_tuple(1, kargs.stride_B),
-                    number<1>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    b_start + batch_offset_B,
-                    make_tuple(kargs.N, kargs.K),
-                    make_tuple(kargs.stride_B, 1),
-                    number<GemmPipeline::VectorSizeB>{},
-                    number<1>{});
-            }
-        }();
-
-        auto a_pad_view = [&]() {
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(
-                    a_tensor_view,
-                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-                    sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(
-                    a_tensor_view,
-                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-                    sequence<GemmPipeline::kPadM, false>{});
-            }
-        }();
-        // clang-format on
-
-        auto a_block_window = make_tile_window(
-            a_pad_view,
-            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-            {i_m, 0});
-
-        auto b_pad_view = [&]() {
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
-            {
-                return pad_tensor_view(
-                    b_tensor_view,
-                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-                    sequence<false, GemmPipeline::kPadK>{});
-            }
-            else
-            {
-                return pad_tensor_view(
-                    b_tensor_view,
-                    make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-                    sequence<GemmPipeline::kPadN, false>{});
-            }
-        }();
-        // clang-format on
-
-        auto b_block_window = make_tile_window(
-            b_pad_view,
-            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-            {i_n, 0});
-
-        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
-
-        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
-
-        // Run GEMM cooperatively by whole wokrgroup.
-        auto c_block_tile =
-            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
+        const BDataType* b_ptr    = static_cast<const BDataType*>(kargs.b_ptr) + batch_offset_B;
 
         const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C);
         const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C);
-        CDataType* c_start        = static_cast<CDataType*>(kargs.c_ptr);
-        auto c_tensor_view        = [&]() {
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    c_start + batch_offset_C,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(kargs.stride_C, 1),
-                    number<GemmPipeline::VectorSizeC>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    c_start + batch_offset_C,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_C),
-                    number<1>{},
-                    number<1>{});
-            }
-        }();
-
-        auto c_pad_view = [&]() {
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return pad_tensor_view(
-                    c_tensor_view,
-                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
-                    sequence<false, GemmPipeline::kPadN>{});
-            }
-            else
-            {
-                return pad_tensor_view(
-                    c_tensor_view,
-                    make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
-                    sequence<GemmPipeline::kPadM, false>{});
-            }
-        }();
-        auto c_block_window = make_tile_window(
-            c_pad_view,
-            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
-            {i_m, i_n});
+        CDataType* c_ptr          = static_cast<CDataType*>(kargs.c_ptr) + batch_offset_C;
 
-        EpiloguePipeline{}(c_block_window, c_block_tile);
+        this->RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n);
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 763d8cad9..925648a88 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -12,6 +12,50 @@
 
 namespace ck_tile {
 
+struct GemmProblem
+{
+    CK_TILE_HOST GemmProblem() = default;
+    CK_TILE_HOST GemmProblem(
+        index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_)
+        : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_)
+    {
+    }
+
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+};
+
+struct GemmHostArgs : public GemmProblem
+{
+    CK_TILE_HOST GemmHostArgs() = default;
+    CK_TILE_HOST GemmHostArgs(const void* a_ptr_,
+                              const void* b_ptr_,
+                              void* c_ptr_,
+                              index_t k_batch_,
+                              index_t M_,
+                              index_t N_,
+                              index_t K_,
+                              index_t stride_A_,
+                              index_t stride_B_,
+                              index_t stride_C_)
+        : GemmProblem(M_, N_, K_, stride_A_, stride_B_, stride_C_),
+          a_ptr(a_ptr_),
+          b_ptr(b_ptr_),
+          c_ptr(c_ptr_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* a_ptr;
+    const void* b_ptr;
+    void* c_ptr;
+    index_t k_batch;
+};
+
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
 struct GemmKernel
 {
@@ -25,9 +69,12 @@ struct GemmKernel
 
     using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
     using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
-    // using CAccDataType = remove_cvref_t<typename GemmPipeline::CDataType>;
     using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
+    static constexpr auto I0 = number<0>();
+    static constexpr auto I1 = number<1>();
+    static constexpr auto I2 = number<2>();
+
     __host__ static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
     {
         return TilePartitioner::GridSize(M, N, KBatch);
@@ -35,7 +82,7 @@ struct GemmKernel
 
     __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
 
-    struct GemmCommonKargs
+    struct GemmKernelArgs
     {
         const void* a_ptr;
         const void* b_ptr;
@@ -48,25 +95,37 @@ struct GemmKernel
         index_t stride_C;
     };
 
-    CK_TILE_HOST static constexpr GemmCommonKargs MakeKargs(const void* a_ptr,
-                                                            const void* b_ptr,
-                                                            void* c_ptr,
-                                                            index_t M,
-                                                            index_t N,
-                                                            index_t K,
-                                                            index_t stride_A,
-                                                            index_t stride_B,
-                                                            index_t stride_C)
+    CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs)
     {
-        return GemmCommonKargs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C};
+        return GemmKernelArgs{hostArgs.a_ptr,
+                              hostArgs.b_ptr,
+                              hostArgs.c_ptr,
+                              hostArgs.M,
+                              hostArgs.N,
+                              hostArgs.K,
+                              hostArgs.stride_A,
+                              hostArgs.stride_B,
+                              hostArgs.stride_C};
     }
+    // CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const void* a_ptr,
+    //                                                             const void* b_ptr,
+    //                                                             void* c_ptr,
+    //                                                             index_t M,
+    //                                                             index_t N,
+    //                                                             index_t K,
+    //                                                             index_t stride_A,
+    //                                                             index_t stride_B,
+    //                                                             index_t stride_C)
+    // {
+    //     return GemmKernelArgs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C};
+    // }
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
-    CK_TILE_HOST static bool IsSupportedArgument(const GemmCommonKargs& kargs)
+    CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
     {
         if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
         {
@@ -139,18 +198,16 @@ struct GemmKernel
         return true;
     }
 
-    CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
+    CK_TILE_DEVICE auto MakeGemmTensorViews(const ADataType* a_ptr,
+                                            const BDataType* b_ptr,
+                                            CDataType* c_ptr,
+                                            const GemmKernelArgs& kargs) const
     {
-        const auto [i_m, i_n] = TilePartitioner{}();
-        // options
-        const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
-        const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
-        // Convert pointers to tensor views
-        auto a_tensor_view = [&]() {
+        const auto& a_tensor_view = [&]() {
             if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
-                    a_start,
+                    a_ptr,
                     make_tuple(kargs.M, kargs.K),
                     make_tuple(kargs.stride_A, 1),
                     number<GemmPipeline::VectorSizeA>{},
@@ -159,7 +216,7 @@ struct GemmKernel
             else
             {
                 return make_naive_tensor_view<address_space_enum::global>(
-                    a_start,
+                    a_ptr,
                     make_tuple(kargs.M, kargs.K),
                     make_tuple(1, kargs.stride_A),
                     number<1>{},
@@ -167,11 +224,11 @@ struct GemmKernel
             }
         }();
 
-        auto b_tensor_view = [&]() {
+        const auto& b_tensor_view = [&]() {
             if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
-                    b_start,
+                    b_ptr,
                     make_tuple(kargs.N, kargs.K),
                     make_tuple(1, kargs.stride_B),
                     number<1>{},
@@ -180,7 +237,7 @@ struct GemmKernel
             else
             {
                 return make_naive_tensor_view<address_space_enum::global>(
-                    b_start,
+                    b_ptr,
                     make_tuple(kargs.N, kargs.K),
                     make_tuple(kargs.stride_B, 1),
                     number<GemmPipeline::VectorSizeB>{},
@@ -188,7 +245,35 @@ struct GemmKernel
             }
         }();
 
-        auto a_pad_view = [&]() {
+        const auto& c_tensor_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<GemmPipeline::VectorSizeC>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        return make_tuple(a_tensor_view, b_tensor_view, c_tensor_view);
+    }
+
+    template <typename TensorView>
+    CK_TILE_DEVICE auto MakeGemmPadViews(const TensorView& views) const
+    {
+        const auto& a_pad_view = [&]() {
+            const auto& a_tensor_view = views.at(I0);
             if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
             {
                 return pad_tensor_view(
@@ -204,14 +289,9 @@ struct GemmKernel
                     sequence<GemmPipeline::kPadM, false>{});
             }
         }();
-        // clang-format on
-
-        auto a_block_window = make_tile_window(
-            a_pad_view,
-            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
-            {i_m, 0});
 
-        auto b_pad_view = [&]() {
+        const auto& b_pad_view = [&]() {
+            const auto& b_tensor_view = views.at(I1);
             if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
             {
                 return pad_tensor_view(
@@ -228,43 +308,8 @@ struct GemmKernel
             }
         }();
 
-        auto b_block_window = make_tile_window(
-            b_pad_view,
-            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
-            {i_n, 0});
-
-        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
-
-        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
-
-        // Run GEMM cooperatively by whole wokrgroup.
-        auto c_block_tile =
-            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
-
-        CDataType* c_start = static_cast<CDataType*>(kargs.c_ptr);
-        auto c_tensor_view = [&]() {
-            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    c_start,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(kargs.stride_C, 1),
-                    number<GemmPipeline::VectorSizeC>{},
-                    number<1>{});
-            }
-            else
-            {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    c_start,
-                    make_tuple(kargs.M, kargs.N),
-                    make_tuple(1, kargs.stride_C),
-                    number<1>{},
-                    number<1>{});
-            }
-        }();
-
-        auto c_pad_view = [&]() {
+        const auto& c_pad_view = [&]() {
+            const auto& c_tensor_view = views.at(I2);
             if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
             {
                 return pad_tensor_view(
@@ -280,12 +325,82 @@ struct GemmKernel
                     sequence<GemmPipeline::kPadM, false>{});
             }
         }();
-        auto CBlockWindow_pad = make_tile_window(
+
+        return make_tuple(a_pad_view, b_pad_view, c_pad_view);
+    }
+
+    template <typename PadView>
+    CK_TILE_DEVICE auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) const
+    {
+        const auto& a_pad_view     = views.at(I0);
+        const auto& a_block_window = make_tile_window(
+            a_pad_view,
+            make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kK>{}),
+            {i_m, 0});
+
+        const auto& b_pad_view     = views.at(I1);
+        const auto& b_block_window = make_tile_window(
+            b_pad_view,
+            make_tuple(number<TilePartitioner::kN>{}, number<TilePartitioner::kK>{}),
+            {i_n, 0});
+
+        const auto& c_pad_view = views.at(I2);
+        auto c_block_window    = make_tile_window(
             c_pad_view,
             make_tuple(number<TilePartitioner::kM>{}, number<TilePartitioner::kN>{}),
             {i_m, i_n});
 
-        EpiloguePipeline{}(CBlockWindow_pad, c_block_tile);
+        return make_tuple(a_block_window, b_block_window, c_block_window);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param kargs GEMM kernel arguments
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     */
+    CK_TILE_DEVICE void RunGemm(const ADataType* a_ptr,
+                                const BDataType* b_ptr,
+                                CDataType* c_ptr,
+                                const GemmKernelArgs& kargs,
+                                const index_t block_idx_m,
+                                const index_t block_idx_n) const
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs);
+        const auto& gemm_pad_views          = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& c_block_tile =
+            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I2);
+        EpiloguePipeline{}(c_block_window, c_block_tile);
+    }
+
+    CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const
+    {
+        const auto [i_m, i_n] = TilePartitioner{}();
+        // options
+        const ADataType* a_ptr = static_cast<const ADataType*>(kargs.a_ptr);
+        const BDataType* b_ptr = static_cast<const BDataType*>(kargs.b_ptr);
+        CDataType* c_ptr       = static_cast<CDataType*>(kargs.c_ptr);
+
+        RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n);
     }
 };
 
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
index 88145b987..d3f307787 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -24,12 +24,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
     using AccDataType = std::tuple_element_t<5, Tuple>;
     using CDataType   = std::tuple_element_t<6, Tuple>;
 
-    struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs
-    {
-    };
-
     template <typename ALayout, typename BLayout, typename CLayout>
-    void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
+    void invoke_batched_gemm(const ck_tile::BatchedGemmHostArgs& args,
+                             const ck_tile::stream_config& s)
     {
         // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
         constexpr bool kPadM        = false;
@@ -94,9 +91,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
         using Kernel =
             ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
 
-        auto kargs = Kernel::MakeKargs(args);
+        auto kargs = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args);
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.batch_count);
         constexpr dim3 blocks = Kernel::BlockSize();
 
         if(s.log_level_ > 0)
@@ -185,21 +182,22 @@ class TestCkTileBatchedGemm : public ::testing::Test
         c_m_n_dev_buf.SetZero();
         c_m_n_dev_result.SetZero();
 
-        batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(),
-                                 b_k_n_dev_buf.GetDeviceBuffer(),
-                                 c_m_n_dev_buf.GetDeviceBuffer(),
-                                 M,
-                                 N,
-                                 K,
-                                 StrideA,
-                                 StrideB,
-                                 StrideC,
-                                 BatchStrideA,
-                                 BatchStrideB,
-                                 BatchStrideC,
-                                 BatchCount};
-
-        invoke_batched_gemm<ALayout, BLayout, CLayout>(kargs,
+        ck_tile::BatchedGemmHostArgs args;
+        args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
+        args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
+        args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
+        args.M              = M;
+        args.N              = N;
+        args.K              = K;
+        args.stride_A       = StrideA;
+        args.stride_B       = StrideB;
+        args.stride_C       = StrideC;
+        args.batch_stride_A = BatchStrideA;
+        args.batch_stride_B = BatchStrideB;
+        args.batch_stride_C = BatchStrideC;
+        args.batch_count    = BatchCount;
+
+        invoke_batched_gemm<ALayout, BLayout, CLayout>(args,
                                                        ck_tile::stream_config{nullptr, false});
 
         std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index a51498602..53ead4d8d 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -31,22 +31,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
     static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value;
     // TODO: expose tile size through test t-param ?
 
-    struct gemm_args
-    {
-        const void* p_a;
-        const void* p_b;
-        void* p_c;
-        ck_tile::index_t kbatch;
-        ck_tile::index_t M;
-        ck_tile::index_t N;
-        ck_tile::index_t K;
-        ck_tile::index_t stride_A;
-        ck_tile::index_t stride_B;
-        ck_tile::index_t stride_C;
-    };
-
     template <bool PadM, bool PadN, bool PadK>
-    void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s)
+    void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     {
         // TODO: This should be parameterized in tests
         constexpr ck_tile::index_t M_Tile = 128;
@@ -117,17 +103,9 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                                                              has_hot_loop_v,
                                                                              tail_number_v>>>;
             using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-            auto kargs   = Kernel::MakeKargs(args.p_a,
-                                           args.p_b,
-                                           args.p_c,
-                                           args.M,
-                                           args.N,
-                                           args.K,
-                                           args.stride_A,
-                                           args.stride_B,
-                                           args.stride_C);
-
-            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
             constexpr dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
@@ -319,11 +297,11 @@ class TestCkTileGemmPipeline : public ::testing::Test
         c_m_n_dev_buf.SetZero();
         c_m_n_dev_result.SetZero();
 
-        gemm_args args;
-        args.p_a      = a_m_k_dev_buf.GetDeviceBuffer();
-        args.p_b      = b_k_n_dev_buf.GetDeviceBuffer();
-        args.p_c      = c_m_n_dev_buf.GetDeviceBuffer();
-        args.kbatch   = kbatch;
+        ck_tile::GemmHostArgs args;
+        args.a_ptr    = a_m_k_dev_buf.GetDeviceBuffer();
+        args.b_ptr    = b_k_n_dev_buf.GetDeviceBuffer();
+        args.c_ptr    = c_m_n_dev_buf.GetDeviceBuffer();
+        args.k_batch  = kbatch;
         args.M        = M;
         args.N        = N;
         args.K        = K;
-- 
GitLab


From e758d006a55dd45ee9aae009b5ab554d42736dfb Mon Sep 17 00:00:00 2001
From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com>
Date: Thu, 19 Dec 2024 17:55:35 +0100
Subject: [PATCH 135/153] Apply Ck-tile argument parser for vectors [I/O]
 (#1758)

* Parser for a vector was added. Additionaly we valid correctnes of numbers

* Remove unnecessary comments

* Review part 1

* Review part 2

* Add const to variadic lambda

* Rename C->K
---
 .../ck_tile/17_grouped_gemm/grouped_gemm.hpp  | 20 +++++---
 .../run_grouped_gemm_example.inc              | 34 ++++++++------
 include/ck_tile/host/arg_parser.hpp           | 46 ++++++++++++++++++-
 3 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
index 94af4711d..20ba74088 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -34,13 +34,19 @@ using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("a_layout", "R", "A tensor data layout - Row by default")
-        .insert("b_layout", "R", "B tensor data layout - Row by default")
-        .insert("c_layout", "R", "C tensor data layout - Row by default")
-        .insert("validate", "1", "0. No validation, 1. Validation on CPU")
-        .insert("warmup", "10", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("group_count", "16", "group count");
+    arg_parser.insert("Ms", "", "M dimensions - empty by default.")
+        .insert("Ns", "", "N dimensions - empty by default.")
+        .insert("Ks", "", "K dimensions - empty by default.")
+        .insert("stride_As", "", "Tensor A strides - it is empty by default.")
+        .insert("stride_Bs", "", "Tensor B strides - it is empty by default.")
+        .insert("stride_Cs", "", "Tensor C strides - it is empty by default.")
+        .insert("a_layout", "R", "A tensor data layout - Row by default.")
+        .insert("b_layout", "R", "B tensor data layout - Row by default.")
+        .insert("c_layout", "R", "C tensor data layout - Row by default.")
+        .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
+        .insert("warmup", "10", "number of iterations before benchmark the kernel.")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel.")
+        .insert("group_count", "16", "group count.");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
index cd5b1c286..11faa6642 100644
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -53,26 +53,34 @@ int run_grouped_gemm_example_with_layouts(int argc,
         return -1;
     };
 
+    auto valid_input_data = [&](int group_count, const auto&... args) {
+        return !(args.empty() || ...) && group_count == (args.size() == ...);
+    };
+
     const int group_count = arg_parser.get_int("group_count");
     const int repeat      = arg_parser.get_int("repeat");
     const int warmup      = arg_parser.get_int("warmup");
 
-    std::vector<ck_tile::index_t> Ms;
-    std::vector<ck_tile::index_t> Ns;
-    std::vector<ck_tile::index_t> Ks;
-    std::vector<ck_tile::index_t> stride_As;
-    std::vector<ck_tile::index_t> stride_Bs;
-    std::vector<ck_tile::index_t> stride_Cs;
+    std::vector<ck_tile::index_t> Ms        = arg_parser.get_int_vec("Ms");
+    std::vector<ck_tile::index_t> Ns        = arg_parser.get_int_vec("Ns");
+    std::vector<ck_tile::index_t> Ks        = arg_parser.get_int_vec("Ks");
+    std::vector<ck_tile::index_t> stride_As = arg_parser.get_int_vec("stride_As");
+    std::vector<ck_tile::index_t> stride_Bs = arg_parser.get_int_vec("stride_Bs");
+    std::vector<ck_tile::index_t> stride_Cs = arg_parser.get_int_vec("stride_Cs");
 
-    for(int i = 0; i < group_count; i++)
+    if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs))
     {
-        Ms.push_back(256 + 256 * i);
-        Ns.push_back(128 + 128 * i);
-        Ks.push_back(128 + 64 * i);
+        std::cout << "Please check the input data. Default values will be used." << std::endl;
+        for(int i = 0; i < group_count; i++)
+        {
+            Ms.push_back(256 + 256 * i);
+            Ns.push_back(128 + 128 * i);
+            Ks.push_back(128 + 64 * i);
 
-        stride_As.push_back(Ks[i]);
-        stride_Bs.push_back(Ks[i]);
-        stride_Cs.push_back(Ns[i]);
+            stride_As.push_back(Ks[i]);
+            stride_Bs.push_back(Ks[i]);
+            stride_Cs.push_back(Ns[i]);
+        }
     }
 
     std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
diff --git a/include/ck_tile/host/arg_parser.hpp b/include/ck_tile/host/arg_parser.hpp
index 3765156df..df309f312 100644
--- a/include/ck_tile/host/arg_parser.hpp
+++ b/include/ck_tile/host/arg_parser.hpp
@@ -15,11 +15,14 @@
 
 namespace ck_tile {
 /*
- * a host side utility, arg parser for
- *  -[key0]=[value0] -[key1]=[value1] ...
+ * a host side utility, arg parser for, either
+ * -[key0] = [value0, value1, value2]
+ * or
+ * -[key0]=[value0] -[key1]=[value1] ...
  */
 class ArgParser
 {
+
     public:
     class Arg
     {
@@ -187,6 +190,45 @@ class ArgParser
         return value;
     }
 
+    std::vector<std::string> get_string_vec(const std::string& name,
+                                            const std::string& delimiter = ",") const
+    {
+        if(get_str(name).empty())
+        {
+            return {};
+        }
+        std::string s = get_str(name);
+        std::vector<std::string> tokens;
+        size_t pos = 0;
+        std::string token;
+        while((pos = s.find(delimiter)) != std::string::npos)
+        {
+            token = s.substr(0, pos);
+            tokens.push_back(token);
+            s.erase(0, pos + delimiter.length());
+        }
+        tokens.push_back(s);
+
+        return tokens;
+    }
+
+    std::vector<int> get_int_vec(const std::string& name, const std::string& delimiter = ",") const
+    {
+        if(get_str(name).empty())
+        {
+            return {};
+        }
+        const std::vector<std::string> args = get_string_vec(name, delimiter);
+        std::vector<int> tokens;
+        tokens.reserve(static_cast<int>(args.size()));
+        for(const std::string& token : args)
+        {
+            int value = atoi(token.c_str());
+            tokens.push_back(value);
+        }
+        return tokens;
+    }
+
     private:
     std::unordered_map<std::string, Arg> input_map;
     std::vector<std::string> keys;
-- 
GitLab


From 2944c508941055a0cf36d5a96092d6c739f53c36 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 19 Dec 2024 17:24:05 -0800
Subject: [PATCH 136/153] fix profiler_grouped_gemm (#1766)

---
 profiler/include/profiler/profile_grouped_gemm_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index c10cd0ea9..367e94de1 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -77,7 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification,
     std::vector<Tensor<CDataType>> c_m_n_host_results;
     std::vector<Tensor<CDataType>> c_m_n_device_results;
 
-    ComputeDataType max_abs_in_val = 0.f;
+    double max_abs_in_val = 0.f;
     for(std::size_t i = 0; i < group_count; i++)
     {
         a_m_k.push_back(
-- 
GitLab


From 37cdbf4f0ec88ba5064f46c3370633b5950bc7ae Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Fri, 20 Dec 2024 14:41:01 +0800
Subject: [PATCH 137/153] [CK_TILE] Add fmha fwd N-Warp S-Shuffle pipeline
 (fmha fwd splitkv pipeline variant) (#1705)

* Add check for zero values

* Add static assertions

* Remove invalid option '-e' in smoke_test.sh

* Use correct path of smoke_test.sh

* Avoid zero-sized shared memory array

* Add warning comment

* Replace expr by integer_divide_ceil() call

* Use more readable constant names

* Write down assumption as static assertion

* Add more diagnostic error messages

* Fix wrong BlockWarps when using default pipeline policy

* Add more static assertions for A LDS desc

* Allow using vector size < 8 for data type fp16/bf16

* Align vector size between DRAM dist & LDS desc

* Remove no-longer used func decl

* Fix wrong displayed piepline name

* Undo policy template changes for tile_example_gemm_basic

* Add missing space and make error message stands out

* Unify print precision

* Add missing include directive <iomanip>

* Replace constant 64 by get_warp_size() call

* Replace constant 128 by named variable: BankLength

* Add kAMBlock/kBNBlock attributes

* Allow usig different A/B warp dist for multiple blocks

* Add helper function to get warp dist encodings

* Add 4x64x4 fp16 warp gemm attribute impl

* Complete the A/B warp dist encoding logic

* Fix wrong thread mapping for C matrix

* Use smaller vector size for small tile

* Add static assert to block unsupported warp gemm impl

* Extract common code out as helper method

* Add 4x64x16 fp16 warp gemm type alias

* Add comment to warning developers

* Undo WarpGemmAtrributeMfma<> changes

* Use more clear static assertion error message

* Add trivial wrapper to get warp dstr encodings

* Only transpose warp gemm result if it's square

* Fix compilation error

* Support multi-block warp gemm (on N direction)

* Remove duplicated code

* Fix output encoding of warp gemm

* Fix wrong shape of WarpGemmAtrributeMfmaIterateK<>

* Remove unused code

* Fix wrong shape of WarpGemmAttributeMfmaImplF16F16F32M4N64K4

* Add type config for bf16_t

* Add 4x64x16 bf16 warp gemm

* Update WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution

* Add 64x4x4 fp16/bf16 warp gemm impl

* Add 64x4x16 fp16/bf16 warp gemm

* Add static assertion for better error diagnostic

* Get Q dram dstr directly form block gemm

* Add missing header: fused_moe.hpp

* Allow specifying different warp-gemm for gemm0 & gemm1

* Store P matrix into LDS before gemm1

* Fix inconsistant kernel name

* Remove constraint on gemm0 & gemm1 block warps

* Remove unsupported vector size from checking list

* Allow using 4x64x16 warp gemm for gemm0

* Finish policy customization

* Finish pipeline modification
F#

* Use block warps in codegen

* Fix wrong rank of m_lds_window origin

* Use better distributed tensor

* Make P-store earlier

* Remove duplicated experssions

* Remove unnecessary tile window

* Create new files for new splitkv pipeline

* Separate old/new pipeline codegen logic

* Sync changes form develop

* Undo gemm kernel/pipeline changes

* Undo gemm example changes

* Remove blank lines

* Fix typo

* Use new warp gemm interface

* Fix link error

* Fix wrong pipeline tag

* Fix more link error

* Avoid unnecessary padding

* Always use vector load for K

* Padding on fastest dimension when necessary

* Force padding Q on hdim_q

* Set high dimension padding flag to false

* Re-format headers

* Use warps=<1, 4, 1> for both gemm0 & gemm1

* Fix complilation errors

* Remove m/l shuffle logics

* Ignore duplicate data when write lse_acc

* Use gemm0 block warps as lds tile width

* Remove hard-coded numbers

* Fix wrong distribution width

* Remove unnecessary code

* Add s_barrier before writing to LDS

* Store Q into LDS before gemm0

* Fix wrong Q tile size

* Use simple Q lds descriptor for debuging

* Use more realistic Q lds descriptor

* Add comment & use better variable name

* Make Q lds space not overlapped with others

* Remove unnecessary block_tile_reduce_sync() call

* Move Q load statements

* Move block_sync_lds() right before use

* Re-order instructions

* Remove necessary lambda expression

* Use 8 threads on kMaxSplits direction while doing reduction

* Tiny correction for using 8 threads on kMaxSplits direction for combine kernel

* Padding num_split direction of o_acc tile window to 4x

* Update splitkv combine pipeline design

* Add kN1 back to splitkv combine pipeline problem

* Fix compilation errors

* Add missing template parameter

* Fix wrong splitkv combine kernel name

* Fix wrong origin

* Fix wrong LDS descriptor shape

* Fix sync & reduction logics

* Remove unnecessary static assertions

* Extract tile size computation logics

* Make sure we can reuse padding flags in combine kernels

* Rename variables

* Use OaccDataType in BlockFmhaSplitKVCombinePipelineTileSizes<>

* Remove unnecessary static assertion

* Fix function name typo

* Add constraint on kN1 template parameter

* Hide K tile loading latency in earlier iteration

* Fix wrong splitkv kernel name

* Use s_shuffling to replace p_shuffling which removes the needs of cross-warp reduction

* Rename pipeline

* Fix wrong pipeline name attribute

* Add GetAlignmentQ() for NWarpSShuffle pipeline

* Separate Q tile into dram tile & register tile concepts

* Remove non-squre warp gemm transpose c type alias

* Fallback tile size changes for fmha fwd splitkv

* Remove redundant change

* Refine naming for the S tile

* Use better naming of the S tile dstr (read from lds)

* Share Q lds with K lds

* Tiny change

* Fix with using static_for for passing CI checking

---------

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
---
 .../ck_tile/01_fmha/codegen/cpp_symbol_map.py |   1 +
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |  42 +-
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |  85 +-
 example/ck_tile/01_fmha/fmha_fwd.hpp          |   2 -
 .../core/arch/amd_buffer_addressing.hpp       |   4 +-
 .../core/tensor/static_distributed_tensor.hpp |   1 +
 include/ck_tile/ops/fmha.hpp                  |   2 +
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |   6 +-
 .../fmha_fwd_splitkv_combine_kernel.hpp       |  56 +-
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |   9 +-
 ...lock_fmha_fwd_splitkv_combine_pipeline.hpp |  83 +-
 ...plitkv_combine_pipeline_default_policy.hpp | 173 ++--
 ...litkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp | 794 ++++++++++++++++++
 ...nwarp_sshuffle_qr_ks_vs_default_policy.hpp | 226 +++++
 .../pipeline/block_fmha_pipeline_problem.hpp  |  36 +-
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp |  55 +-
 .../ops/fmha/pipeline/tile_fmha_shape.hpp     |   2 -
 ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp |  44 +-
 .../block/block_gemm_areg_bsmem_creg_v2.hpp   |  44 +-
 include/ck_tile/ops/gemm/warp/warp_gemm.hpp   |  16 +
 .../gemm/warp/warp_gemm_attribute_mfma.hpp    | 303 ++++++-
 .../warp/warp_gemm_attribute_mfma_impl.hpp    | 271 ++++++
 .../ops/gemm/warp/warp_gemm_dispatcher.hpp    |   4 +
 23 files changed, 1987 insertions(+), 272 deletions(-)
 create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
 create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp

diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
index f6df44a31..332707eaf 100644
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -119,6 +119,7 @@ PIPELINE_MAP = {
 PIPELINE_ENUM_MAP = {
     "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
     "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC",
+    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
 }
 
 BOOL_MAP = {
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index eca638784..66814f5a1 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -44,13 +44,12 @@ FMHA_FWD_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
 
 using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
-using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
                                       ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
-                                      fmha_warp_tile_{F_idx},
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
                                       ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
-                                      fmha_warp_tile_{F_idx},
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
                                       {F_vlayout}>;
 
 using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
@@ -306,15 +305,19 @@ class FmhaFwdTileSize:
     F_rm1       : int  # number of warps for gemm1 along q seqlen
     F_rn1       : int  # number of warps for gemm1 along head dim v
     F_rk1       : int  # number of warps for gemm1 along k seqlen (not used)
-    F_wm        : int  # warp size along m (warp size)
-    F_wn        : int  # warp size along n
-    F_wk        : int  # warp size along k
+    F_wm0       : int  # gemm0 warp size along m
+    F_wn0       : int  # gemm0 warp size along n
+    F_wk0       : int  # gemm0 warp size along k
+    F_wm1       : int  # gemm1 warp size along m
+    F_wn1       : int  # gemm1 warp size along n
+    F_wk1       : int  # gemm1 warp size along k
     F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
     @property
     def name(self) -> str:
         return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
         f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
-        f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\
+        ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 
 @dataclass
 class FmhaFwdKernel:
@@ -352,9 +355,12 @@ class FmhaFwdKernel:
                 F_rm1           = self.F_tile.F_rm1,
                 F_rn1           = self.F_tile.F_rn1,
                 F_rk1           = self.F_tile.F_rk1,
-                F_wm            = self.F_tile.F_wm,
-                F_wn            = self.F_tile.F_wn,
-                F_wk            = self.F_tile.F_wk,
+                F_wm0           = self.F_tile.F_wm0,
+                F_wn0           = self.F_tile.F_wn0,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
                 F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
                 F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                 F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
@@ -409,17 +415,17 @@ class FmhaFwdKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,     2, 1, 1,  2, 1, 1,  32, 32, 16, -1),
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
-            ## '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32, 96,   4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16, -1),
+            '32'  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,     2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
         }
     else:
         return None
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index e448902cf..df5b9cecc 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -39,6 +39,7 @@ K0_MAX_SUBMAX_MAP = {
 
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
     "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
+    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS",
     "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
 }
 
@@ -50,13 +51,12 @@ namespace {{
 template <bool kHasUnevenSplits>
 struct kernel_runner {{
 using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
-using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
                                           ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
-                                          fmha_warp_tile,
+                                          ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
                                           ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
-                                          fmha_warp_tile,
+                                          ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
                                           {F_vlayout}>;
 
 using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
@@ -161,9 +161,8 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem<
     typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
     typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
     {F_hdim},
-    {F_bm0},
-    {F_bn1},
     {F_mode},
+    {F_bn1},
     fmha_trait>;
 
 using fmha_pipeline = ck_tile::BlockFmhaFwdSplitKVCombinePipeline<
@@ -177,9 +176,11 @@ using fmha_epilogue =
                                            false, false>>;
 
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVCombineKernel<ck_tile::FmhaFwdSplitKVCombineTilePartitioner<{F_bm0}, {F_bn1}>,
-                  fmha_pipeline,
-                  fmha_epilogue>;
+    ck_tile::FmhaFwdSplitKVCombineKernel<
+        ck_tile::FmhaFwdSplitKVCombineTilePartitioner<
+            fmha_pipeline_problem::kM0, fmha_pipeline_problem::kN1>,
+        fmha_pipeline,
+        fmha_epilogue>;
 
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -192,7 +193,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }};
 }}
 
-using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn1},
+using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bn1},
                         {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
 
 #include <iostream>
@@ -250,16 +251,25 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
                 using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                
+                // get combine kernel tile sizes
+                using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType;
+                constexpr ck_tile::index_t kM0 = ck_tile::BlockFmhaSplitKVCombinePipelineTileSizes<OaccDataType, /*F_bn1=*/32>::kM0;
+
+                // make sure we can reuse the padding flags in combine kernels
+                static_assert({F_bm0} % kM0 == 0);
+                static_assert({F_bn1} % 32 == 0);
+
                 if (t.has_lse) {{
                     if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{
                         return -1;
                     }} else {{
-                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, true, {F_squant}, {F_spad}, {F_dvpad}>;
+                        using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, true, {F_squant}, {F_spad}, {F_dvpad}>;
 
                         return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
                     }}
                 }} else {{
-                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, false, {F_squant}, {F_spad}, {F_dvpad}>;
+                    using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, false, {F_squant}, {F_spad}, {F_dvpad}>;
 
                     return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
                 }}
@@ -302,7 +312,7 @@ class FmhaFwdSplitKVApiTrait:
         if self.pipeline_tag == 'qr_async':
             if self.spad == 't' : return 'true' # always support
             else :                return 'true'
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
             if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.seqlen_q % {self.bm0} == 0'
         else: assert False
@@ -313,7 +323,7 @@ class FmhaFwdSplitKVApiTrait:
         if self.pipeline_tag == 'qr_async':
             if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
             else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
-        elif self.pipeline_tag in ['qr', 'qr_fp8']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
             if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.seqlen_k % {self.bn0} == 0'
         else: assert False
@@ -324,7 +334,7 @@ class FmhaFwdSplitKVApiTrait:
             vec = int((32 * 4) / DTYPE_BITS[self.dtype])
             if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
             else :               assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
             bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
             if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :               return f'a.hdim_q % {bk0submax} == 0'
@@ -336,7 +346,7 @@ class FmhaFwdSplitKVApiTrait:
             vec = int((32 * 4) / DTYPE_BITS[self.dtype])
             if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
             else :                assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qr_nwarp_sshuffle']:
             bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
             if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.hdim_v % {bk0submax} == 0'
@@ -447,12 +457,11 @@ class FmhaFwdSplitKVApiPool:
 
 @dataclass
 class FmhaFwdSplitKVCombineTileSize:
-    F_bm0       : int  # tile size along q seqlen
     F_bn1       : int  # tile size along v head_dim
     F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
     @property
     def name(self) -> str:
-        return f"b{self.F_bm0}x{self.F_bn1}" +\
+        return f"b{self.F_bn1}" +\
             ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
 
 @dataclass
@@ -485,9 +494,12 @@ class FmhaFwdSplitKVKernel:
                 F_rm1           = self.F_tile.F_rm1,
                 F_rn1           = self.F_tile.F_rn1,
                 F_rk1           = self.F_tile.F_rk1,
-                F_wm            = self.F_tile.F_wm,
-                F_wn            = self.F_tile.F_wn,
-                F_wk            = self.F_tile.F_wk,
+                F_wm0           = self.F_tile.F_wm0,
+                F_wn0           = self.F_tile.F_wn0,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
                 F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
                 F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                 F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
@@ -553,7 +565,6 @@ class FmhaFwdSplitKVCombineKernel:
                 F_idx           = self.F_idx,
                 F_hdim          = self.F_hdim,
                 F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
-                F_bm0           = self.F_tile.F_bm0,
                 F_bn1           = self.F_tile.F_bn1,
                 F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
                 F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
@@ -577,17 +588,17 @@ class FmhaFwdSplitKVCombineKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16, -1),
-            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
-            ## '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
-            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
-            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16, -1),
+            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+        ### '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32, -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32, -1)
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
         }
     else:
         return None
@@ -595,17 +606,17 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
 def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : FmhaFwdSplitKVCombineTileSize(16, 16,  -1),
-            '64'  : FmhaFwdSplitKVCombineTileSize(32, 32,  -1),
-            ## '96' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
-            '128' : FmhaFwdSplitKVCombineTileSize(32, 64,  -1),
-            '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
+            '32'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
+        ### '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
     }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdSplitKVCombineTileSize(64, 32,  -1),
-            '128' : FmhaFwdSplitKVCombineTileSize(64, 64,  -1),
-            '256' : FmhaFwdSplitKVCombineTileSize(64, 128, -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32,   -1),
+            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
         }
     else:
         return None
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index aee54b475..0e821ed5d 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -709,7 +709,6 @@ std::string fmha_fwd_splitkv_get_name_();
 template <ck_tile::index_t HDim_,
           typename DataType_,
           bool kIsGroupMode_,
-          ck_tile::index_t kM0_,
           ck_tile::index_t kN1_,
           bool kStoreLse_,
           bool kDoFp8StaticQuant_,
@@ -720,7 +719,6 @@ struct fmha_fwd_splitkv_combine_traits_
     static constexpr ck_tile::index_t HDim  = HDim_;
     using DataType                          = ck_tile::remove_cvref_t<DataType_>;
     static constexpr bool kIsGroupMode      = kIsGroupMode_;
-    static constexpr ck_tile::index_t kM0   = kM0_;
     static constexpr ck_tile::index_t kN1   = kN1_;
     static constexpr bool kStoreLse         = kStoreLse_;
     static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_;
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index bebf035e9..107aae551 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -1303,8 +1303,8 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
     static_assert(
         (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
             (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
             (std::is_same<T, int32_t>::value &&
              (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index 568d618ec..8d2f88af3 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -29,6 +29,7 @@ struct static_distributed_tensor
         remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
 
     static constexpr index_t kThreadElementSpaceSize = ThreadTensorDesc{}.get_element_space_size();
+    static_assert(0 < kThreadElementSpaceSize, "Make sure tile distribution is valid");
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_dimension()
     {
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index e106264ce..7a09e4622 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -29,6 +29,8 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp"
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 3de433d6a..90102a6c6 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -71,7 +71,8 @@ struct FmhaFwdKernel
         using bfs = typename FmhaPipeline::BlockFmhaShape;
         using g0br = typename bfs::Gemm0BlockWarps;
         using g1br = typename bfs::Gemm1BlockWarps;
-        using gwt = typename bfs::Gemm0WarpTile;
+        using g0wt = typename bfs::Gemm0WarpTile;
+        using g1wt = typename bfs::Gemm1WarpTile;
         #define _SS_  std::string
         #define _TS_  std::to_string
         auto pn = [&] () {
@@ -88,7 +89,8 @@ struct FmhaFwdKernel
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
             "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
             "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
-            "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
+            "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
+            "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
             (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index 0bccabdd2..a0adfdc12 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -8,9 +8,11 @@ namespace ck_tile {
 template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdSplitKVCombineKernel
 {
-    using TilePartitioner                = remove_cvref_t<TilePartitioner_>;
-    using FmhaPipeline                   = remove_cvref_t<FmhaPipeline_>;
-    using EpiloguePipeline               = remove_cvref_t<EpiloguePipeline_>;
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using FmhaPipeline     = remove_cvref_t<FmhaPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+
+    static constexpr index_t kNumWarps   = FmhaPipeline::kNumWarps;
     static constexpr index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
     static_assert(kBlockPerCu > 0);
@@ -50,8 +52,7 @@ struct FmhaFwdSplitKVCombineKernel
         return
             _SS_("fmha_fwd_splitkv_combine_d") + _TS_(FmhaPipeline::kHeadDimV) + "_" + _SS_(t2s<ODataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_"
-            "b" + _TS_(FmhaPipeline::kM0) + "x" +
-                    _TS_(FmhaPipeline::kN1) + "_" +
+            "b" + _TS_(FmhaPipeline::kN1) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) +
             _SS_(FmhaPipeline::name) +
             (pn.empty() ? "" : "_" + pn) +
@@ -339,37 +340,56 @@ struct FmhaFwdSplitKVCombineKernel
                 number<FmhaPipeline::kAlignmentOacc>{},
                 number<1>{});
 
+            // read 4 * (kM0, kN1) o_acc tiles simultaneously by 4 warps
             const auto o_acc_dram_view = pad_tensor_view(
                 o_acc_dram_naive,
-                make_tuple(number<1>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
-                sequence<false, kPadSeqLenQ, kPadHeadDimV>{});
+                make_tuple(
+                    number<kNumWarps>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+                sequence<true, kPadSeqLenQ, kPadHeadDimV>{});
 
+            const index_t padded_num_splits =
+                o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<0>{}];
             const index_t padded_seqlen_q =
                 o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<1>{}];
             const index_t padded_hdim_v =
                 o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<2>{}];
 
-            return transform_tensor_view(
+            const index_t num_m_tiles = integer_divide_floor(padded_seqlen_q, FmhaPipeline::kM0);
+
+            // transform tensor view by following steps, given shape: (padded_num_splits,
+            // padded_seqlen_q, padded_hdim_v)
+            //     1. unmerge to (padded_num_splits, num_m_tiles, kM0, padded_hdim_v)
+            //     2. transpose to (num_m_tiles, padded_num_splits, kM0, padded_hdim_v)
+            //     3. merge to (num_m_tiles * padded_num_splits * kM0, padded_hdim_v)
+            auto transposed = transform_tensor_view(
                 o_acc_dram_view,
-                make_tuple(make_merge_transform(make_tuple(kargs.num_splits, padded_seqlen_q)),
+                make_tuple(make_pass_through_transform(padded_num_splits),
+                           make_unmerge_transform(make_tuple(num_m_tiles, FmhaPipeline::kM0)),
                            make_pass_through_transform(padded_hdim_v)),
-                make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                make_tuple(sequence<1>{}, sequence<0, 2>{}, sequence<3>{}));
+
+            return transform_tensor_view(
+                transposed,
+                make_tuple(make_merge_transform(
+                               make_tuple(num_m_tiles, padded_num_splits, FmhaPipeline::kM0)),
+                           make_pass_through_transform(padded_hdim_v)),
+                make_tuple(sequence<0, 1, 2>{}, sequence<3>{}),
                 make_tuple(sequence<0>{}, sequence<1>{}));
         }();
 
         auto lse_acc_dram_window = make_tile_window(
             lse_acc_dram,
-            [&]() {
-                return make_tuple(number<FmhaPipeline::kMaxSplits>{}, number<FmhaPipeline::kM0>{});
-            }(),
+            make_tuple(number<FmhaPipeline::kMaxSplits>{}, number<FmhaPipeline::kM0>{}),
             {0, i_m0});
 
+        const index_t padded_num_splits =
+            integer_divide_ceil(kargs.num_splits, kNumWarps) * kNumWarps;
+
         auto o_acc_dram_window = make_tile_window(
             o_acc_dram,
-            [&]() {
-                return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{});
-            }(),
-            {i_m0, i_n1});
+            make_tuple(number<kNumWarps * FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+            {i_tile_m * padded_num_splits * FmhaPipeline::kM0, i_n1});
 
         // LSE DRAM window
         auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
@@ -410,7 +430,6 @@ struct FmhaFwdSplitKVCombineKernel
                     identity{},                                          // lse_element_func
                     composes(saturates<fp8_t>{}, scales{kargs.scale_o}), // o_acc_element_func
                     kargs.num_splits,
-                    kargs.seqlen_q,
                     smem_ptr);
             }
             else
@@ -419,7 +438,6 @@ struct FmhaFwdSplitKVCombineKernel
                                       o_acc_dram_window,
                                       lse_dram_window,
                                       kargs.num_splits,
-                                      kargs.seqlen_q,
                                       smem_ptr);
             }
         }();
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index f37e676da..dc1748726 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -45,6 +45,7 @@ struct FmhaFwdSplitKVKernel
     static constexpr bool kPadHeadDimQ      = FmhaPipeline::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV      = FmhaPipeline::kPadHeadDimV;
     static constexpr auto BiasEnum          = FmhaPipeline::BiasEnum;
+    static constexpr bool kStoreLSE         = FmhaPipeline::kStoreLSE;
     static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
     static constexpr bool kIsPagedKV        = FmhaPipeline::Problem::kIsPagedKV;
 
@@ -67,7 +68,8 @@ struct FmhaFwdSplitKVKernel
         using bfs = typename FmhaPipeline::BlockFmhaShape;
         using g0br = typename bfs::Gemm0BlockWarps;
         using g1br = typename bfs::Gemm1BlockWarps;
-        using gwt = typename bfs::Gemm0WarpTile;
+        using g0wt = typename bfs::Gemm0WarpTile;
+        using g1wt = typename bfs::Gemm1WarpTile;
         #define _SS_  std::string
         #define _TS_  std::to_string
         auto pn = [&] () {
@@ -84,11 +86,12 @@ struct FmhaFwdSplitKVKernel
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
             "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
             "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
-            "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
+            "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
+            "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
             (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) + 
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" );
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "" ) + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" );
         #undef _SS_
         #undef _TS_
         // clang-format on
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
index 7c49fce99..7ac86e6d1 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
@@ -53,6 +53,7 @@ struct BlockFmhaFwdSplitKVCombinePipeline
     using OaccDataType = remove_cvref_t<typename Problem::OaccDataType>;
     using ODataType    = remove_cvref_t<typename Problem::ODataType>;
 
+    static constexpr index_t kNumWarps  = Problem::kNumWarps;
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
     static constexpr index_t kHeadDimV = Problem::kHeadDimV;
@@ -117,7 +118,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
                const LSEElementFunction& lse_element_func,
                const OaccElementFunction& o_acc_element_func,
                index_t num_splits,
-               index_t seqlen_q,
                void* smem_ptr) const
     {
         // lse_acc tile in LDS
@@ -143,11 +143,12 @@ struct BlockFmhaFwdSplitKVCombinePipeline
         // copy lse_acc tile (shape=[kMaxSplits, kM0]) to LDS (shape=[kMaxSplits, kM0]).
         auto lse_acc_tile = load_tile(lse_acc_dram_window);
         store_tile(lse_acc_lds_write_window, lse_acc_tile);
-        block_sync_lds();
 
         auto lse_accum = make_static_distributed_tensor<LSEDataType>(
             Policy::template MakeLSEaccRegTileDistribution<Problem>());
 
+        __builtin_amdgcn_sched_barrier(0);
+        block_sync_lds();
         // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits])
         // and fill up -INF values outside the [kM0, num_splits] region.
         {
@@ -264,46 +265,94 @@ struct BlockFmhaFwdSplitKVCombinePipeline
                 }
             });
         }
-        block_sync_lds();
 
         if constexpr(kStoreLSE)
         {
             store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse_logsum));
         }
 
-        auto o_acc_dist = Policy::template MakeOaccDramTileDistribution<Problem>();
-        auto o_acc_dram_window =
+        auto o_acc_4_dist = Policy::template MakeOacc4DramTileDistribution<Problem>();
+        auto o_acc_4_dram_window =
             make_tile_window(o_acc_dram_block_window_tmp.get_bottom_tensor_view(),
                              o_acc_dram_block_window_tmp.get_window_lengths(),
                              o_acc_dram_block_window_tmp.get_window_origin(),
-                             o_acc_dist);
-        auto o_acc = make_static_distributed_tensor<OaccDataType>(o_acc_dist);
-        clear_tile(o_acc);
+                             o_acc_4_dist);
 
-        const index_t padded_seqlen_q = integer_divide_ceil(seqlen_q, kM0) * kM0;
+        // shape=[4 * KM0, kN1]
+        auto o_acc_4 = make_static_distributed_tensor<OaccDataType>(o_acc_4_dist);
+        clear_tile(o_acc_4);
 
-        for(index_t i_split = 0; i_split < num_splits; ++i_split)
+        const index_t padded_num_splits = integer_divide_ceil(num_splits, kNumWarps) * kNumWarps;
+
+        __builtin_amdgcn_sched_barrier(0);
+        block_sync_lds();
+        // each warp handles a [KM0, kN1] tile
+        for(index_t split_start = 0; split_start < padded_num_splits; split_start += kNumWarps)
         {
-            auto o_tile = load_tile(o_acc_dram_window);
+            auto o_tile             = load_tile(o_acc_4_dram_window);
+            const index_t i_split   = split_start + get_warp_id();
+            const index_t row_start = kM0 * get_warp_id();
             {
-                constexpr auto spans = decltype(o_acc)::get_distributed_spans();
+                constexpr auto spans = decltype(o_acc_4)::get_distributed_spans();
                 sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
                     sweep_tile_span(spans[number<1>{}], [&](auto idx1) {
                         constexpr auto i_j_idx = make_tuple(idx0, idx1);
                         const auto x_indices   = get_x_indices_from_distributed_indices(
-                            o_acc.get_tile_distribution(), i_j_idx);
+                            o_acc_4.get_tile_distribution(), i_j_idx);
 
                         const auto row = x_indices.at(number<0>{});
 
-                        const LSEDataType lse_scale = lse_acc_lds(row, i_split);
-                        o_acc(i_j_idx) += lse_scale * o_tile(i_j_idx);
+                        const LSEDataType lse_scale = lse_acc_lds(row - row_start, i_split);
+                        o_acc_4(i_j_idx) += lse_scale * o_tile(i_j_idx);
                     });
                 });
             }
 
-            move_tile_window(o_acc_dram_window, {padded_seqlen_q, 0});
+            move_tile_window(o_acc_4_dram_window, {kNumWarps * kM0, 0});
+        }
+
+        // 4 o_acc tiles in LDS. shape=[4 * kM0, kN1]
+        OaccDataType* o_acc_4_lds_ptr = static_cast<OaccDataType*>(static_cast<void*>(
+            static_cast<char*>(smem_ptr) + Policy::template GetSmemSizeLSEacc<Problem>()));
+
+        {
+            auto o_acc_4_lds_window = [&]() {
+                auto desc = Policy::template MakeOacc4LdsBlockDescriptor<Problem>();
+                auto view = make_tensor_view<address_space_enum::lds>(o_acc_4_lds_ptr, desc);
+                return make_tile_window(view, desc.get_lengths(), {0, 0});
+            }();
+            store_tile(o_acc_4_lds_window, o_acc_4);
         }
 
+        auto o_acc_dist = Policy::template MakeOaccDramTileDistribution<Problem>();
+
+        auto o_acc_4_lds_window = [&]() {
+            auto desc = Policy::template MakeOacc4LdsBlockDescriptor<Problem>();
+            auto view = make_tensor_view<address_space_enum::lds>(o_acc_4_lds_ptr, desc);
+            return make_tile_window(view, desc.get_lengths(), {0, 0}, o_acc_dist);
+        }();
+
+        auto o_acc = make_static_distributed_tensor<OaccDataType>(o_acc_dist);
+        clear_tile(o_acc);
+
+        __builtin_amdgcn_sched_barrier(0);
+        block_sync_lds();
+        static_for<0, kNumWarps, 1>{}([&](auto) {
+            auto o_acc_in = load_tile(o_acc_4_lds_window);
+
+            {
+                constexpr auto spans = decltype(o_acc)::get_distributed_spans();
+                sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
+                    sweep_tile_span(spans[number<1>{}], [&](auto idx1) {
+                        constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                        o_acc(i_j_idx) += o_acc_in(i_j_idx);
+                    });
+                });
+            }
+
+            move_tile_window(o_acc_4_lds_window, {kM0, 0});
+        });
+
         o_acc = tile_elementwise_in(o_acc_element_func, o_acc);
 
         return o_acc;
@@ -316,7 +365,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
                                         const OaccDramBlockWindow& o_acc_dram_block_window,
                                         LSEDramBlockWindow& lse_dram_block_window,
                                         index_t num_splits,
-                                        index_t seqlen_q,
                                         void* smem_ptr) const
     {
         return operator()(lse_acc_dram_block_window,
@@ -325,7 +373,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
                           identity{},
                           identity{},
                           num_splits,
-                          seqlen_q,
                           smem_ptr);
     }
 };
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
index ebd69c0cf..2d4abb388 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
@@ -10,23 +10,38 @@ namespace ck_tile {
 
 struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
 {
-    template <index_t BlockSize, index_t M, index_t N, typename DataType>
+    template <index_t NumWarps, index_t M, index_t N, typename DataType>
+    CK_TILE_HOST_DEVICE static constexpr auto GetMaxNumWarpsForTile()
+    {
+        static_assert(NumWarps == 1 || NumWarps == 2 || NumWarps == 4);
+
+        constexpr index_t ElemPerThread = (M * N) / (NumWarps * get_warp_size());
+        if constexpr(0 < ElemPerThread)
+        {
+            return NumWarps;
+        }
+        else
+        { // try dividing tile by smaller # of warps
+            return GetMaxNumWarpsForTile<NumWarps / 2, M, N, DataType>();
+        }
+    }
+
+    template <index_t NumWarps, index_t M, index_t N, typename DataType>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeForTile()
     {
-        constexpr index_t PixelsPerThread = (M * N) / BlockSize;
-        static_assert(0 < PixelsPerThread);
+        constexpr index_t MaxNumWarps = GetMaxNumWarpsForTile<NumWarps, M, N, DataType>();
 
-        constexpr index_t MaxNPerThread = 16 / sizeof(DataType);
-        constexpr index_t NPerThread    = min(MaxNPerThread, PixelsPerThread);
+        constexpr index_t ElemPerThread = (M * N) / (MaxNumWarps * get_warp_size());
 
-        return NPerThread;
+        constexpr index_t MaxNPerThread = 16 / sizeof(DataType);
+        return min(MaxNPerThread, ElemPerThread);
     }
 
     // alignment for dram lse tile (shape=[kMaxSplits, kM0])
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentLSE()
     {
-        return GetVectorSizeForTile<Problem::kBlockSize,
+        return GetVectorSizeForTile<Problem::kNumWarps,
                                     Problem::kMaxSplits,
                                     Problem::kM0,
                                     typename Problem::LSEDataType>();
@@ -56,40 +71,54 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeLSEacc()
     {
         return sizeof(typename Problem::LSEDataType) *
                MakeLSEaccLdsBlockDescriptor<Problem>().get_element_space_size();
     }
 
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeOacc4()
+    {
+        return sizeof(typename Problem::OaccDataType) *
+               MakeOacc4LdsBlockDescriptor<Problem>().get_element_space_size();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return GetSmemSizeLSEacc<Problem>() + GetSmemSizeOacc4<Problem>();
+    }
+
     // shape=[kMaxSplits, kM0]
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccDramTileDistribution()
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-        constexpr index_t kNumWarps  = Problem::kNumWarps;
-
-        constexpr index_t kNPerBlock = Problem::kM0;
         constexpr index_t kMPerBlock = Problem::kMaxSplits;
+        constexpr index_t kNPerBlock = Problem::kM0;
+
+        constexpr index_t MaxNumWarps =
+            GetMaxNumWarpsForTile<Problem::kNumWarps, kNPerBlock, kMPerBlock, LSEDataType>();
+        constexpr index_t Replicate = Problem::kNumWarps / MaxNumWarps;
 
         constexpr index_t NPerThread =
-            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
+            GetVectorSizeForTile<MaxNumWarps, kMPerBlock, kNPerBlock, LSEDataType>();
         constexpr index_t NThreads = kNPerBlock / NPerThread;
 
         constexpr index_t MThreadsPerWarp = get_warp_size() / NThreads;
-        constexpr index_t MPerThread      = kMPerBlock / (kNumWarps * MThreadsPerWarp);
+        constexpr index_t MPerThread      = kMPerBlock / (MaxNumWarps * MThreadsPerWarp);
 
+        static_assert(MPerThread * MaxNumWarps * MThreadsPerWarp == kMPerBlock);
         static_assert(NThreads * NPerThread == kNPerBlock);
-        static_assert(MPerThread * kNumWarps * MThreadsPerWarp == kMPerBlock);
 
         return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<1>,
-                                       tuple<sequence<MPerThread, kNumWarps, MThreadsPerWarp>,
+            tile_distribution_encoding<sequence<Replicate>,
+                                       tuple<sequence<MPerThread, MaxNumWarps, MThreadsPerWarp>,
                                              sequence<NThreads, NPerThread>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       tuple<sequence<0, 1>, sequence<1, 2>>,
+                                       tuple<sequence<0, 1>, sequence<2, 0>>,
                                        sequence<1, 2>,
                                        sequence<0, 1>>{});
     }
@@ -100,17 +129,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-
-        constexpr index_t kMPerBlock = Problem::kMaxSplits;
-        constexpr index_t kNPerBlock = Problem::kM0;
+        constexpr index_t kMPerBlock = Problem::kM0;
+        constexpr index_t kNPerBlock = Problem::kMaxSplits;
         constexpr index_t NPack =
-            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
+            GetVectorSizeForTile<Problem::kNumWarps, kMPerBlock, kNPerBlock, LSEDataType>();
 
         constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kNPerBlock / NPack>{}, number<kMPerBlock>{}, number<NPack>{}),
             make_tuple(number<(kMPerBlock + 1) * NPack>{}, number<NPack>{}, number<1>{}),
-            number<8>{},
+            number<NPack>{},
             number<1>{});
 
         constexpr auto lse_acc_lds_block_desc = transform_tensor_descriptor(
@@ -129,17 +156,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
     {
         using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-
-        constexpr index_t kMPerBlock = Problem::kMaxSplits;
-        constexpr index_t kNPerBlock = Problem::kM0;
+        constexpr index_t kMPerBlock = Problem::kM0;
+        constexpr index_t kNPerBlock = Problem::kMaxSplits;
         constexpr index_t NPack =
-            GetVectorSizeForTile<kBlockSize, kMPerBlock, kNPerBlock, LSEDataType>();
+            GetVectorSizeForTile<Problem::kNumWarps, kMPerBlock, kNPerBlock, LSEDataType>();
 
         constexpr auto lse_acc_lds_block_desc_0 = make_naive_tensor_descriptor(
             make_tuple(number<kNPerBlock / NPack>{}, number<kMPerBlock>{}, number<NPack>{}),
             make_tuple(number<(kMPerBlock + 1) * NPack>{}, number<NPack>{}, number<1>{}),
-            number<8>{},
+            number<NPack>{},
             number<1>{});
 
         constexpr auto lse_acc_t_lds_block_desc = transform_tensor_descriptor(
@@ -152,33 +177,86 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
         return lse_acc_t_lds_block_desc;
     }
 
+    // 3d + padding, shape=[4 * kM0, kN1]
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccRegTileDistribution()
+    CK_TILE_HOST_DEVICE static constexpr auto MakeOacc4LdsBlockDescriptor()
     {
-        constexpr index_t kBlockSize = Problem::kBlockSize;
+        using LSEDataType = remove_cvref_t<typename Problem::LSEDataType>;
 
-        constexpr index_t kNPerBlock = Problem::kMaxSplits;
+        constexpr index_t kMPerBlock = 4 * Problem::kM0;
+        constexpr index_t kNPerBlock = Problem::kN1;
+        constexpr index_t NPack =
+            GetVectorSizeForTile<Problem::kNumWarps, kMPerBlock, kNPerBlock, LSEDataType>();
+
+        constexpr auto o_acc_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kNPerBlock / NPack>{}, number<kMPerBlock>{}, number<NPack>{}),
+            make_tuple(number<(kMPerBlock + 1) * NPack>{}, number<NPack>{}, number<1>{}),
+            number<8>{},
+            number<1>{});
+
+        constexpr auto o_acc_t_lds_block_desc = transform_tensor_descriptor(
+            o_acc_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kMPerBlock),
+                       make_merge_transform(make_tuple(kNPerBlock / NPack, NPack))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        return o_acc_t_lds_block_desc;
+    }
+
+    // shape=[kM0, kMaxSplits]
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLSEaccRegTileDistribution()
+    {
         constexpr index_t kMPerBlock = Problem::kM0;
+        constexpr index_t kNPerBlock = Problem::kMaxSplits;
 
-        constexpr index_t NThreads   = 4;
-        constexpr index_t NPerThread = kNPerBlock / NThreads;
+        constexpr index_t MaxNThreads = 8;
+        constexpr index_t NThreads    = min(kNPerBlock, MaxNThreads);
+        constexpr index_t NPerThread  = kNPerBlock / NThreads;
 
-        constexpr index_t MThreads       = kBlockSize / NThreads;
-        constexpr index_t MPerThread     = kMPerBlock / MThreads;
-        constexpr index_t MWarps         = kBlockSize / get_warp_size();
+        constexpr index_t MPerThread     = 1;
+        constexpr index_t MThreads       = kMPerBlock / MPerThread;
         constexpr index_t MThreadPerWarp = get_warp_size() / NThreads;
 
+        constexpr index_t MaxNumWarps = (MThreads * NThreads) / get_warp_size();
+        constexpr index_t Replicate   = Problem::kNumWarps / MaxNumWarps;
+
+        static_assert(MaxNumWarps * MThreadPerWarp * MPerThread == kMPerBlock);
         static_assert(NThreads * NPerThread == kNPerBlock);
-        static_assert(MWarps * MThreadPerWarp * MPerThread == kMPerBlock);
 
         return make_static_tile_distribution(
-            tile_distribution_encoding<
-                sequence<1>,
-                tuple<sequence<MWarps, MThreadPerWarp, MPerThread>, sequence<NThreads, NPerThread>>,
-                tuple<sequence<1>, sequence<2, 1>>,
-                tuple<sequence<0>, sequence<0, 1>>,
-                sequence<1, 2>,
-                sequence<2, 1>>{});
+            tile_distribution_encoding<sequence<Replicate>,
+                                       tuple<sequence<MaxNumWarps, MThreadPerWarp, MPerThread>,
+                                             sequence<NThreads, NPerThread>>,
+                                       tuple<sequence<0, 1>, sequence<2, 1>>,
+                                       tuple<sequence<0, 0>, sequence<0, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<2, 1>>{});
+    }
+
+    // similar to MakeOaccDramTileDistribution(), but duplicate same 1-warp encoding 4 times on M
+    // direction
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeOacc4DramTileDistribution()
+    {
+        constexpr index_t kMPerBlock = Problem::kM0; // real kMPerBlock we want is (4 * kM0)
+        constexpr index_t kNPerBlock = Problem::kN1;
+        static_assert(get_warp_size() <= kMPerBlock * kNPerBlock);
+
+        constexpr index_t M1 = 1; // compose encoding base on 1 warp
+        constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size());
+        constexpr index_t N0 = get_warp_size() / M2;
+        constexpr index_t N1 = kNPerBlock / N0;
+        constexpr index_t M0 = kMPerBlock / (M2 * M1);
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<4, M0, M1, M2>, sequence<N0, N1>>,
+                                       tuple<sequence<1, 1>, sequence<1, 2>>,
+                                       tuple<sequence<0, 2>, sequence<3, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<1, 1>>{});
     }
 
     template <typename Problem>
@@ -187,6 +265,7 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
         constexpr index_t kBlockSize = Problem::kBlockSize;
         constexpr index_t kMPerBlock = Problem::kM0;
         constexpr index_t kNPerBlock = Problem::kN1;
+        static_assert(kBlockSize <= kMPerBlock * kNPerBlock);
 
         constexpr index_t M1 = kBlockSize / get_warp_size();
         constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size());
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
new file mode 100644
index 000000000..3726cd433
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+
+namespace ck_tile {
+
+// This pipeline is qkv all located in LDS
+template <typename Problem_,
+          typename Policy_ = BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy>
+struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
+{
+    using Problem             = remove_cvref_t<Problem_>;
+    using Policy              = remove_cvref_t<Policy_>;
+    using QDataType           = remove_cvref_t<typename Problem::QDataType>;
+    using KDataType           = remove_cvref_t<typename Problem::KDataType>;
+    using VDataType           = remove_cvref_t<typename Problem::VDataType>;
+    using SaccDataType        = remove_cvref_t<typename Problem::SaccDataType>;
+    using SMPLComputeDataType = remove_cvref_t<typename Problem::SMPLComputeDataType>;
+    using BiasDataType        = remove_cvref_t<typename Problem::BiasDataType>;
+    using LSEDataType         = remove_cvref_t<typename Problem::LSEDataType>;
+    using PDataType           = remove_cvref_t<typename Problem::PDataType>;
+    using OaccDataType        = remove_cvref_t<typename Problem::OaccDataType>;
+    using ODataType           = remove_cvref_t<typename Problem::ODataType>;
+    using FmhaMask            = remove_cvref_t<typename Problem::FmhaMask>;
+
+    using BlockFmhaShape             = remove_cvref_t<typename Problem::BlockFmhaShape>;
+    using VLayout                    = remove_cvref_t<typename BlockFmhaShape::VLayout>;
+    static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once
+    static_assert(kQLoadOnce == Policy::QLoadOnce);
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
+
+    static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
+    static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
+    static constexpr bool kPadSeqLenK      = Problem::kPadSeqLenK;
+    static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
+    static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
+    static constexpr auto BiasEnum         = Problem::BiasEnum;
+    static constexpr bool kStoreLSE        = Problem::kStoreLSE;
+    static constexpr bool kIsPagedKV       = Problem::kIsPagedKV;
+    static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits;
+
+    // last dimension vector length used to create tensor view(and decide buffer_load vector length)
+    // ... together with tensor distribution. tensor dist should able to overwrite this
+    static constexpr index_t kAlignmentQ =
+        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+    static constexpr index_t kAlignmentK =
+        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+    static constexpr index_t kAlignmentV = []() {
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            return kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        else
+            return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
+    }();
+
+    static constexpr index_t kAlignmentOacc =
+        kPadHeadDimV ? 1 : Policy::template GetAlignmentOacc<Problem>();
+
+    static constexpr index_t kAlignmentBias =
+        kPadSeqLenK ? 1 : Policy::template GetAlignmentBias<Problem>();
+
+    static constexpr index_t kBlockPerCu = []() {
+        if constexpr(Problem::kBlockPerCu != -1)
+            return Problem::kBlockPerCu;
+        else
+        {
+            if constexpr(kQKHeaddim <= 32)
+            {
+                return 2;
+            }
+            else if constexpr(kQKHeaddim <= 64)
+            {
+                return 3;
+            }
+            else if constexpr(kQKHeaddim <= 128)
+            {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                    return 1;
+                else
+                    return 2;
+            }
+            else if constexpr(kQKHeaddim <= 256)
+            {
+                return 1;
+            }
+        }
+    }();
+
+    static constexpr const char* name = "qr_nwarp_sshuffle";
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowLengths,
+              typename KPageBlockNavigator,
+              typename VDramBlockWindowLengths,
+              typename VPageBlockNavigator,
+              typename BiasDramBlockWindowTmp,
+              typename LSEaccDramBlockWindowTmp,
+              typename QElementFunction,
+              typename KElementFunction,
+              typename VElementFunction,
+              typename BiasElementFunction,
+              typename LSEaccElementFunction,
+              typename SAccElementFunction,
+              typename PComputeElementFunction,
+              typename OAccElementFunction,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile
+               const QElementFunction& q_element_func,
+               const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile
+               const KPageBlockNavigator& k_page_block_navigator,
+               const KElementFunction& k_element_func,
+               const VDramBlockWindowLengths& v_dram_block_window_lengths, // N1*K1 tile
+               const VPageBlockNavigator& v_page_block_navigator,
+               const VElementFunction& v_element_func,
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               const BiasElementFunction& bias_element_func,
+               LSEaccDramBlockWindowTmp& lse_acc_dram_window_tmp, // M0*1 tile
+               const LSEaccElementFunction& lse_acc_element_func,
+               const SAccElementFunction& s_acc_element_func,
+               const PComputeElementFunction& p_compute_element_func,
+               const OAccElementFunction& o_acc_element_func,
+               index_t num_splits,
+               index_t i_split,
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate
+               void* smem_ptr) const
+    {
+        static_assert(
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KPageBlockNavigator::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VPageBlockNavigator::DataType>>,
+            "wrong!");
+
+        static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kSubQKHeaddim ==
+                              QDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] &&
+                          kN0 == KDramBlockWindowLengths{}[number<0>{}] &&
+                          kK0 == KDramBlockWindowLengths{}[number<1>{}] &&
+                          kN1 == VDramBlockWindowLengths{}[number<0>{}] &&
+                          kK1 == VDramBlockWindowLengths{}[number<1>{}] &&
+                          kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!");
+        // Q tile in LDS
+        QDataType* q_lds_ptr =
+            static_cast<QDataType*>(static_cast<void*>(static_cast<char*>(smem_ptr)));
+        auto q_lds = make_tensor_view<address_space_enum::lds>(
+            q_lds_ptr, Policy::template MakeQLdsBlockDescriptor<Problem>());
+
+        // K tile in LDS
+        KDataType* k_lds_ptr =
+            static_cast<KDataType*>(static_cast<void*>(static_cast<char*>(smem_ptr)));
+        auto k_lds = make_tensor_view<address_space_enum::lds>(
+            k_lds_ptr, Policy::template MakeKLdsBlockDescriptor<Problem>());
+        auto k_lds_window =
+            make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+
+        // V tile in LDS
+        auto v_lds = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType*>(static_cast<char*>(smem_ptr) +
+                                         max(Policy::template GetSmemSizeQ<Problem>(),
+                                             Policy::template GetSmemSizeK<Problem>())),
+            Policy::template MakeVLdsBlockDescriptor<Problem>());
+        auto v_lds_window = make_tile_window(
+            v_lds, Policy::template MakeVLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+
+        // S tile in LDS
+        auto s_lds = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<SaccDataType*>(reinterpret_cast<char*>(smem_ptr) +
+                                            max(Policy::template GetSmemSizeQ<Problem>(),
+                                                Policy::template GetSmemSizeK<Problem>())),
+            Policy::template MakeSLdsBlockDescriptor<Problem>());
+        auto s_write_lds_window = make_tile_window(
+            s_lds, Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+        auto s_read_lds_window =
+            make_tile_window(s_lds,
+                             Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeSRegTileDistribution<Problem>());
+
+        // Block GEMM
+        constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
+        constexpr auto gemm_1 = Policy::template GetKVBlockGemm<Problem>();
+
+        auto q_dram_window =
+            make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(),
+                             q_dram_block_window_tmp.get_window_lengths(),
+                             q_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeQDramTileDistribution<Problem>());
+
+        // load Q here, will store Q into LDS to maximize throughput
+        auto origin_q = load_tile(q_dram_window);
+
+        using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile());
+        auto s_acc              = SaccBlockTileType{};
+
+        // reduction function for softmax
+        const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+        const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
+
+        using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile());
+
+        auto o_acc = OaccBlockTileType{};
+
+        // infer Sacc, S, P, M, L, Oacc type
+        using SBlockTileType = decltype(cast_tile<SMPLComputeDataType>(o_acc));
+
+        using MLBlockTileType = decltype(block_tile_reduce<SMPLComputeDataType>(
+            SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0}));
+
+        // init M, L
+        auto m = MLBlockTileType{};
+        auto l = MLBlockTileType{};
+
+        clear_tile(o_acc);
+        set_tile(m, -numeric<SMPLComputeDataType>::infinity());
+        clear_tile(l);
+
+        const auto q_origin = q_dram_window.get_window_origin();
+        const auto [logical_seqlen_k_start, logical_seqlen_k_end] = mask.GetTileRangeAlongX(
+            q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{}, num_splits, i_split);
+
+        // check early exit if no work to do
+        if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
+        {
+            const index_t logical_num_total_loop =
+                integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0);
+            if(logical_num_total_loop <= 0)
+            {
+                if constexpr(kStoreLSE)
+                {
+                    auto lse_acc =
+                        make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+                    set_tile(lse_acc, -numeric<SMPLComputeDataType>::infinity());
+
+                    if(get_thread_local_1d_id() < kM0)
+                    {
+                        store_tile(lse_acc_dram_window_tmp,
+                                   tile_elementwise_in(lse_acc_element_func, lse_acc));
+                    }
+                }
+
+                // Note: here occ are all cleard, return it
+                // Note: q loaded but no fence, ignore it.
+                return o_acc;
+            }
+        }
+
+        const index_t physical_seqlen_k_start = logical_seqlen_k_start + kv_l2p_offset;
+        const index_t physical_seqlen_k_end   = logical_seqlen_k_end + kv_l2p_offset;
+        // make sure the first tile is completely located in page-block (page-block size should be
+        // divisible by kN0)
+        // relationship between each *_start variables: aligned_physical_seqlen_k_start <=
+        // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start
+        const index_t aligned_physical_seqlen_k_start =
+            [&, physical_seqlen_k_start_ = physical_seqlen_k_start] {
+                if constexpr(kIsPagedKV)
+                {
+                    return kN0 * integer_divide_floor(physical_seqlen_k_start_, kN0);
+                }
+                else
+                {
+                    return physical_seqlen_k_start_;
+                }
+            }();
+        const index_t num_total_loop =
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+
+        auto [i_page_block_k, k_dram_block_window] = k_page_block_navigator.make_tile_window(
+            k_dram_block_window_lengths, {aligned_physical_seqlen_k_start, 0});
+
+        const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
+        auto bias_dram_window =
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
+                             bias_dram_block_window_tmp.get_window_lengths(),
+                             {bias_origin.at(number<0>{}),
+                              logical_seqlen_k_start - (physical_seqlen_k_start -
+                                                        aligned_physical_seqlen_k_start)}, // M/N
+                             Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
+
+        auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
+            v_dram_block_window_lengths,
+            {0, aligned_physical_seqlen_k_start}, // TODO: hdim split?
+            Policy::template MakeVDramTileDistribution<Problem>());
+
+        // store Q into LDS
+        __builtin_amdgcn_sched_barrier(0);
+        auto q_lds_window_for_store = make_tile_window(
+            q_lds, Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+
+        store_tile(q_lds_window_for_store, origin_q);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // load Q from LDS
+        __builtin_amdgcn_sched_barrier(0);
+        auto q_lds_window_for_load = make_tile_window(
+            q_lds,
+            Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+            {0, 0},
+            Policy::template MakeQRegTileDistribution<Problem, decltype(gemm_0)>());
+        block_sync_lds();
+        auto q = load_tile(q_lds_window_for_load);
+        __builtin_amdgcn_sched_barrier(0);
+        auto q_tile = tile_elementwise_in(q_element_func, q);
+
+        // prefetch K tile
+        index_t i_total_loops      = 0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
+        constexpr index_t k1_loops = kN0 / kK1;
+
+        static_assert(2 <= k0_loops);
+        static_assert(1 <= k1_loops);
+
+        auto k_dram_window = make_tile_window(
+            k_dram_block_window,
+            Policy::template MakeKDramTileDistribution<Problem>()); // K DRAM tile window for
+
+        // load the first tile of the first iteration and store to LDS
+        auto k_block_tile = load_tile(k_dram_window);
+        // moving k_dram_window is an in-page-block operation, so there is
+        // no need to invoke k_page_block_navigator.move_tile_window() here.
+        move_tile_window(k_dram_window, {0, kK0});
+        store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
+
+        do
+        {
+            // STAGE 1, QK gemm
+            clear_tile(s_acc); // initialize C
+
+            // load the second tile of the first iteration
+            k_block_tile = load_tile(k_dram_window);
+
+            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+            {
+                __builtin_amdgcn_sched_barrier(
+                    0); // prevent from messing up the order of global loads
+            }
+            const auto bias_tile = load_tile(bias_dram_window); // load bias tile
+            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+            {
+                __builtin_amdgcn_sched_barrier(
+                    0); // prevent from messing up the order of global loads
+            }
+
+            if constexpr(k0_loops > 2)
+            {
+                static_for<0, k0_loops - 2, 1>{}([&](auto i_k0) {
+                    block_sync_lds();
+                    gemm_0(s_acc,
+                           get_slice_tile(q_tile,
+                                          sequence<0, i_k0 * kK0>{},
+                                          sequence<kM0, (i_k0 + 1) * kK0>{}),
+                           k_lds_window);
+                    block_sync_lds();
+                    move_tile_window(k_dram_window, {0, kK0});
+
+                    store_tile(
+                        k_lds_window,
+                        tile_elementwise_in(k_element_func, k_block_tile)); // LDS write i + 1
+                    k_block_tile = load_tile(k_dram_window);                // global read i + 2
+                });
+            }
+
+            const auto v_prefetch = load_tile(v_dram_window); // prefetch load v tile
+            {                                                 // tail
+                block_sync_lds();
+                gemm_0(s_acc,
+                       get_slice_tile(q_tile,
+                                      sequence<0, (k0_loops - 2) * kK0>{},
+                                      sequence<kM0, (k0_loops - 1) * kK0>{}),
+                       k_lds_window);
+                block_sync_lds();
+
+                store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
+                block_sync_lds();
+
+                gemm_0(s_acc,
+                       get_slice_tile(q_tile,
+                                      sequence<0, (k0_loops - 1) * kK0>{},
+                                      sequence<kM0, k0_loops * kK0>{}),
+                       k_lds_window);
+            }
+
+            // STAGE 2, scale_s, add bias, mask, softmax
+            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+            {
+                s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
+                tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
+                tile_elementwise_inout(
+                    [&](auto& x, const auto& y) {
+#if !CK_TILE_FMHA_FWD_FAST_EXP2
+                        x += type_convert<SaccDataType>(bias_element_func(y));
+#else
+                        x += log2e_v<SaccDataType> *
+                             type_convert<SaccDataType>(bias_element_func(y));
+#endif
+                    },
+                    s_acc,
+                    bias_tile);
+            }
+            else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI)
+            {
+                const auto k_origin = k_page_block_navigator.to_global_window_origin(
+                    i_page_block_k, k_dram_block_window.get_window_origin());
+                constexpr auto s_spans = decltype(s_acc)::get_distributed_spans();
+                s_acc                  = tile_elementwise_in(s_acc_element_func, s_acc);
+                sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) {
+                    sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) {
+                        const auto tile_idx = get_x_indices_from_distributed_indices(
+                            s_acc.get_tile_distribution(), make_tuple(idx0, idx1));
+
+                        const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                        const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                        constexpr auto i_j_idx = make_tuple(idx0, idx1);
+
+                        s_acc(i_j_idx) *= scale_s;
+                        // position_encoding accept only logical coordinates, do conversion here
+                        position_encoding.update(s_acc(i_j_idx), row, col - kv_l2p_offset);
+                    });
+                });
+            }
+            else
+            {
+                s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
+#if !CK_TILE_FMHA_FWD_FAST_EXP2
+                tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
+#endif
+            }
+            move_tile_window(bias_dram_window, {0, kN0});
+
+            /// TODO: only check in first/last iteration without increasing code size
+            if constexpr(kHasUnevenSplits)
+            {
+                const auto k_origin = k_page_block_navigator.to_global_window_origin(
+                    i_page_block_k, k_dram_block_window.get_window_origin());
+                set_tile_if(
+                    s_acc,
+                    -numeric<SMPLComputeDataType>::infinity(),
+                    [&,
+                     physical_seqlen_k_start_ = physical_seqlen_k_start,
+                     physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
+                        const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                        if constexpr(kIsPagedKV)
+                        {
+                            return col < physical_seqlen_k_start_ || physical_seqlen_k_end_ <= col;
+                        }
+                        else
+                        {
+                            return physical_seqlen_k_end_ <= col;
+                        }
+                    });
+            }
+
+            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
+            {
+                const auto k_origin = k_page_block_navigator.to_global_window_origin(
+                    i_page_block_k, k_dram_block_window.get_window_origin());
+                // mask accept only logical coordinates, do conversion here
+                bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}),
+                                                           k_origin.at(number<0>{}) - kv_l2p_offset,
+                                                           number<kM0>{},
+                                                           number<kN0>{});
+                if(need_perpixel_check)
+                {
+                    set_tile_if(
+                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                            return mask.IsOutOfBound(row, col - kv_l2p_offset);
+                        });
+                }
+            }
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            // load the first tile for next iteration
+            if(i_total_loops < num_total_loop - 1)
+            {
+                // move K tile windows
+                i_page_block_k = k_page_block_navigator.move_tile_window(
+                    i_page_block_k, k_dram_block_window, {kN0, 0});
+
+                k_dram_window = make_tile_window(
+                    k_dram_block_window,
+                    Policy::template MakeKDramTileDistribution<Problem>()); // K DRAM tile window
+
+                // laod the first tile of the first iteration and store to LDS
+                k_block_tile = load_tile(k_dram_window);
+            }
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            const auto s = cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+
+            // shuffle through LDS so that the tile layout is consistent with required by Gemm1
+            store_tile(s_write_lds_window, s);
+            block_sync_lds();
+            auto s_new = load_tile(s_read_lds_window);
+
+            auto m_local = block_tile_reduce<SMPLComputeDataType>(
+                s_new,
+                sequence<1>{},
+                f_max,
+                -numeric<SMPLComputeDataType>::infinity()); // m_local = rowmax(S{j})
+            block_tile_reduce_sync(m_local, f_max, bool_constant<false>{});
+
+            const auto m_old = m; // m{j-1}
+            tile_elementwise_inout(
+                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j}
+
+            auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>(
+                s_new.get_tile_distribution()); // Pcompute{j}
+
+            static const auto get_validated_m = [](SMPLComputeDataType raw_m) {
+                /// NOTICE: bias might be materialized mask including -inf values, need
+                /// consideration
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return raw_m == -numeric<SMPLComputeDataType>::infinity()
+                               ? type_convert<SMPLComputeDataType>(0.f)
+                               : raw_m;
+                }
+                else
+                {
+                    return raw_m;
+                }
+            };
+
+            constexpr auto p_spans = decltype(p_compute)::get_distributed_spans();
+            sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                auto row_max = scale_s * get_validated_m(m[i_idx]);
+#endif
+                sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max);
+                    }
+#else
+                    p_compute(i_j_idx)     = exp(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+#endif
+                });
+            });
+
+            auto rowsum_p = block_tile_reduce<SMPLComputeDataType>(
+                p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j})
+
+            block_tile_reduce_sync(rowsum_p, f_sum, bool_constant<false>{});
+
+            const auto p =
+                cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute));
+
+            // l{j}, Oacc{j}
+            constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+            sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                const auto tmp = [&]() {
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        auto row_max = scale_s * get_validated_m(m[i_idx]);
+                        return exp2(scale_s * m_old[i_idx] - row_max);
+                    }
+                }();
+#else
+                const auto tmp       = exp(m_old[i_idx] - get_validated_m(m[i_idx]));
+#endif
+                l(i_idx) = tmp * l[i_idx] + rowsum_p[i_idx];
+                sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                    // FIXME: this use different equation from FA v2 paper,
+                    // but produce correc result.
+                    // Is the equation wrong?
+                    o_acc(i_j_idx) *= tmp;
+                });
+            });
+
+            block_sync_lds();
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
+                    Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
+                shuffle_tile(v_shuffle_tmp, v_prefetch);
+                store_tile(
+                    v_lds_window,
+                    tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
+            }
+            else
+            {
+                store_tile(v_lds_window,
+                           tile_elementwise_in(v_element_func, v_prefetch)); // store the prefetch
+            }
+            i_page_block_v =
+                v_page_block_navigator.move_tile_window(i_page_block_v, v_dram_window, {0, kK1});
+
+            // STAGE 3, KV gemm
+            if constexpr(k1_loops > 1)
+            {
+                static_for<0, k1_loops - 1, 1>{}([&,
+                                                  &i_page_block_v_ = i_page_block_v,
+                                                  &v_dram_window_  = v_dram_window](auto i_k1) {
+                    const auto v = load_tile(v_dram_window_); // load next v
+                    block_sync_lds();
+
+                    gemm_1(o_acc,
+                           get_slice_tile(
+                               p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
+                           v_lds_window);
+                    block_sync_lds();
+
+                    if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+                    {
+                        auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
+                            Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
+                        shuffle_tile(v_shuffle_tmp, v);
+                        store_tile(v_lds_window,
+                                   tile_elementwise_in(v_element_func,
+                                                       v_shuffle_tmp)); // store the prefetch
+                    }
+                    else
+                    {
+                        store_tile(v_lds_window,
+                                   tile_elementwise_in(v_element_func, v)); // store next v
+                    }
+                    i_page_block_v_ = v_page_block_navigator.move_tile_window(
+                        i_page_block_v_, v_dram_window_, {0, kK1});
+                });
+            }
+
+            // tail
+            {
+                block_sync_lds();
+                gemm_1(o_acc,
+                       get_slice_tile(
+                           p, sequence<0, (k1_loops - 1) * kK1>{}, sequence<kM0, k1_loops * kK1>{}),
+                       v_lds_window);
+                block_sync_lds();
+            }
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            // load the first tile for next iteration
+            if(i_total_loops < num_total_loop - 1)
+            {
+                // store the first tile for next iteration to LDS
+                // moving k_dram_window is an in-page-block operation, so there is
+                // no need to invoke k_page_block_navigator.move_tile_window() here.
+                move_tile_window(k_dram_window, {0, kK0});
+                store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
+            }
+        } while(++i_total_loops < num_total_loop);
+
+        if constexpr(kStoreLSE)
+        {
+            // store lse acc
+            auto lse_acc = make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+            constexpr auto lse_acc_spans = decltype(lse_acc)::get_distributed_spans();
+            sweep_tile_span(lse_acc_spans[number<0>{}], [&, m_ = m, l_ = l](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                }
+                else
+                {
+                    lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]);
+                }
+#else
+                    lse_acc(i_idx) = m_[i_idx] + log(l_[i_idx]);
+#endif
+            });
+
+            if(get_thread_local_1d_id() < kM0)
+            {
+                store_tile(lse_acc_dram_window_tmp,
+                           tile_elementwise_in(lse_acc_element_func, lse_acc));
+            }
+        }
+
+        // finally, O
+        constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+
+        sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
+            constexpr auto i_idx = make_tuple(idx0);
+            const auto tmp       = [&]() {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return l[i_idx] == 0.f ? 0.f : 1 / l[i_idx];
+                }
+                else
+                    return 1 / l[i_idx];
+            }();
+            sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) {
+                constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                o_acc(i_j_idx) *= tmp;
+            });
+        });
+
+        o_acc = tile_elementwise_in(o_acc_element_func, o_acc);
+
+        return o_acc;
+    }
+
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowLengths,
+              typename KPageBlockNavigator,
+              typename VDramBlockWindowLengths,
+              typename VPageBlockNavigator,
+              typename BiasDramBlockWindowTmp,
+              typename LSEaccDramBlockWindowTmp,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp,         // M0*K0 tile
+               const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile
+               const KPageBlockNavigator& k_page_block_navigator,
+               const VDramBlockWindowLengths& v_dram_block_window_lengths, // N1*K1 tile
+               const VPageBlockNavigator& v_page_block_navigator,
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               LSEaccDramBlockWindowTmp& lse_acc_dram_block_window_tmp,  // M0*1 tile
+               index_t num_splits,
+               index_t i_split,
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate
+               void* smem_ptr) const
+    {
+        return operator()(q_dram_block_window_tmp,
+                          identity{},
+                          k_dram_block_window_lengths,
+                          k_page_block_navigator,
+                          identity{},
+                          v_dram_block_window_lengths,
+                          v_page_block_navigator,
+                          identity{},
+                          bias_dram_block_window_tmp,
+                          identity{},
+                          lse_acc_dram_block_window_tmp,
+                          identity{},
+                          identity{},
+                          identity{},
+                          identity{},
+                          num_splits,
+                          i_split,
+                          mask,
+                          position_encoding,
+                          scale_s,
+                          kv_l2p_offset,
+                          smem_ptr);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
new file mode 100644
index 000000000..74d755ef3
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
+
+namespace ck_tile {
+
+// This pipeline is qkv all located in LDS
+struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
+    : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                          /* AsyncCopyK = */ false,
+                                          /* AsyncCopyV = */ false,
+                                          /* NumPrefetchK = */ 1,
+                                          /* NumPrefetchV = */ 1>
+{
+    using BasePolicy = BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                                           /* AsyncCopyK = */ false,
+                                                           /* AsyncCopyV = */ false,
+                                                           /* NumPrefetchK = */ 1,
+                                                           /* NumPrefetchV = */ 1>;
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentQ()
+    {
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+        constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType);
+
+        // this should align with MakeQDramTileDistribution()
+        constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize;
+        static_assert(0 < ElemPerThread);
+        return min(ElemPerThread, MaxVectorSize);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc()
+    {
+        using OaccDataType = remove_cvref_t<typename Problem::OaccDataType>;
+
+        return static_cast<index_t>(16 / sizeof(OaccDataType));
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution()
+    {
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+        constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType);
+
+        constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize;
+        static_assert(0 < ElemPerThread);
+        constexpr index_t kMaxVecLoad = min(ElemPerThread, MaxVectorSize);
+
+        constexpr index_t KPerThread     = kMaxVecLoad;
+        constexpr index_t KThreads       = kKPerBlock / KPerThread;
+        constexpr index_t MThreadPerWarp = get_warp_size() / KThreads;
+        constexpr index_t NumWarps       = kBlockSize / get_warp_size();
+        constexpr index_t MPerThread     = kMPerBlock / (MThreadPerWarp * NumWarps);
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<MPerThread, NumWarps, MThreadPerWarp>,
+                                             sequence<KThreads, KPerThread>>,
+                                       tuple<sequence<1>, sequence<1, 2>>,
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 1>>{});
+    }
+
+    template <typename Problem, typename BlockGemm>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution()
+    {
+        return BasePolicy::template MakeQDramTileDistribution<Problem, BlockGemm>();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPackQ()
+    {
+        // TODO: this is for 3d layout
+        using QDataType = remove_cvref_t<typename Problem::QDataType>;
+        return static_cast<index_t>(16 / sizeof(QDataType));
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQLdsBlockDescriptor()
+    {
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+        constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize;
+        static_assert(0 < ElemPerThread);
+        constexpr index_t kKPack = min(ElemPerThread, GetSmemKPackQ<Problem>());
+
+        constexpr auto q_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / kKPack>{}, number<kMPerBlock>{}, number<kKPack>{}),
+            make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
+            number<kKPack>{},
+            number<1>{});
+
+        constexpr auto q_lds_block_desc = transform_tensor_descriptor(
+            q_lds_block_desc_0,
+            make_tuple(
+                make_pass_through_transform(number<kMPerBlock>{}),
+                make_merge_transform(make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return q_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemNPackS()
+    {
+        using SDataType = remove_cvref_t<typename Problem::SaccDataType>;
+        return static_cast<index_t>(16 / sizeof(SDataType));
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeSLdsBlockDescriptor()
+    {
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kNPack     = GetSmemNPackS<Problem>();
+
+        constexpr auto s_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kNPerBlock / kNPack>{}, number<kMPerBlock>{}, number<kNPack>{}),
+            make_tuple(number<(kMPerBlock + 1) * kNPack>{}, number<kNPack>{}, number<1>{}),
+            number<kNPack>{},
+            number<1>{});
+
+        constexpr auto s_lds_block_desc = transform_tensor_descriptor(
+            s_lds_block_desc_0,
+            make_tuple(
+                make_pass_through_transform(number<kMPerBlock>{}),
+                make_merge_transform(make_tuple(number<kNPerBlock / kNPack>{}, number<kNPack>{}))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return s_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeSRegTileDistribution()
+    {
+        using BlockGemm = remove_cvref_t<decltype(GetKVBlockGemm<Problem>())>;
+
+        constexpr auto config   = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG                = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        static_assert(MWarp == 1, "Check failed!");
+
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+        constexpr index_t kTileK     = Problem::BlockFmhaShape::kN0;
+
+        // K2 is equal to Impl::kABKPerLane * kKIterPerWarpGemm
+        constexpr index_t K3 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
+        constexpr index_t K2 = WG::WarpGemmAttribute::Impl::kABKLane;
+        constexpr index_t K1 = kKPerBlock / (K2 * K3);
+        constexpr index_t K0 = kTileK / kKPerBlock;
+        constexpr index_t M2 = WG::WarpGemmAttribute::Impl::kAMLane;
+        constexpr index_t M1 = MWarp;
+        constexpr index_t M0 = kMPerBlock / (M2 * M1);
+
+        constexpr auto s2_block_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2, K3>>,
+                                       tuple<sequence<1, 0>, sequence<2, 1>>,
+                                       tuple<sequence<1, 0>, sequence<2, 2>>,
+                                       sequence<1, 2, 2, 2>,
+                                       sequence<0, 0, 1, 3>>{};
+
+        constexpr auto s2_block_dstr = make_static_tile_distribution(s2_block_dstr_encoding);
+
+        return s2_block_dstr;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeQ()
+    {
+        return MakeQLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::QDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeK()
+    {
+        return MakeKLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::KDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeV()
+    {
+        return MakeVLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::VDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeS()
+    {
+        return MakeSLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::SaccDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return max(GetSmemSizeQ<Problem>(), GetSmemSizeK<Problem>()) +
+               max(GetSmemSizeV<Problem>(), GetSmemSizeS<Problem>());
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index d9da2f088..1fe19faaf 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -106,28 +106,43 @@ struct BlockFmhaFwdSplitKVPipelineProblem
     static constexpr index_t kBlockPerCu    = Traits::kBlockPerCu;
 };
 
+// extract tile size attributes to remove dependency on traits
+template <typename OaccDataType_, ck_tile::index_t kN1_>
+struct BlockFmhaSplitKVCombinePipelineTileSizes
+{
+    static constexpr index_t MaxVectorSize = 16 / sizeof(OaccDataType_);
+
+    static constexpr index_t kN1      = kN1_;
+    static constexpr index_t NThreads = kN1 / MaxVectorSize;
+    static constexpr index_t kM0      = get_warp_size() / NThreads; // MThreadPerWarp
+};
+
 template <typename LSEDataType_,
           typename OaccDataType_,
           typename ODataType_,
           index_t HeadDimV_,
-          index_t kM0_,
-          index_t kN1_,
           bool kIsGroupMode_,
+          ck_tile::index_t kN1_,
           typename Traits_>
 struct BlockFmhaSplitKVCombinePipelineProblem
+    : BlockFmhaSplitKVCombinePipelineTileSizes<OaccDataType_, kN1_>
 {
+    using BaseType = BlockFmhaSplitKVCombinePipelineTileSizes<OaccDataType_, kN1_>;
+
     using LSEDataType  = remove_cvref_t<LSEDataType_>;
     using OaccDataType = remove_cvref_t<OaccDataType_>;
     using ODataType    = remove_cvref_t<ODataType_>;
     using Traits       = remove_cvref_t<Traits_>;
 
-    static constexpr index_t kNumWarps  = kM0_ / (get_warp_size() / 4);
-    static constexpr index_t kBlockSize = kNumWarps * get_warp_size();
-    static constexpr bool kIsGroupMode  = kIsGroupMode_;
+    static_assert(std::is_same_v<LSEDataType, OaccDataType>);
 
     static constexpr index_t kHeadDimV = HeadDimV_;
-    static constexpr index_t kM0       = kM0_;
-    static constexpr index_t kN1       = kN1_;
+    static constexpr bool kIsGroupMode = kIsGroupMode_;
+
+    using BaseType::kM0;
+    using BaseType::kN1;
+
+    static_assert(kN1 <= kHeadDimV && kHeadDimV % kN1 == 0);
 
     // attributes from traits
     static constexpr bool kPadSeqLenQ       = Traits::kPadSeqLenQ;
@@ -136,6 +151,13 @@ struct BlockFmhaSplitKVCombinePipelineProblem
     static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant;
     static constexpr index_t kBlockPerCu    = Traits::kBlockPerCu;
     static constexpr index_t kMaxSplits     = Traits::kMaxSplits;
+    static_assert(8 <= kMaxSplits);
+
+    static constexpr index_t kNumWarps  = 4; // always use 4 warps for each workgroup
+    static constexpr index_t kBlockSize = kNumWarps * get_warp_size();
+
+    static_assert(get_warp_size() <= (kM0 * kMaxSplits) &&
+                  (kM0 * kMaxSplits) % get_warp_size() == 0);
 };
 
 template <typename QDataType_,
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index a3a29bb54..1c9df4644 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -41,52 +41,21 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentQ()
     {
+        constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType);
+
         using BlockGemm       = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
         constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
         using WG              = remove_cvref_t<decltype(config.template at<0>())>;
-        return WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
+
+        return min(MaxVectorSize, WG::kK / WG::WarpGemmAttribute::Impl::kABKLane);
     }
 
     template <typename Problem, typename BlockGemm>
     CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution()
     {
-        constexpr auto config   = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
-        using WG                = remove_cvref_t<decltype(config.template at<0>())>;
-        constexpr index_t MWarp = config.template at<1>();
-
-        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
-
-        constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
-        constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane;
-        constexpr index_t K0 = kKPerBlock / (K1 * K2);
-
-        constexpr index_t M2 = WG::WarpGemmAttribute::Impl::kAMLane;
-        constexpr index_t M1 = MWarp;
-        constexpr index_t M0 = kMPerBlock / (M2 * M1);
-
-        if constexpr(1 < Problem::kNumGemm0Warps)
-        {
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>,
-                                           tuple<sequence<1>, sequence<2, 1>>,
-                                           tuple<sequence<1>, sequence<1, 2>>,
-                                           sequence<1, 2, 2>,
-                                           sequence<0, 0, 2>>{});
-        }
-        else
-        {
-            static_assert(MWarp == 1);
-
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>,
-                                           tuple<sequence<2, 1>>,
-                                           tuple<sequence<1, 2>>,
-                                           sequence<1, 2, 2>,
-                                           sequence<0, 0, 2>>{});
-        }
+        return BlockGemm::template MakeABlockTileDistribution<
+            Problem::BlockFmhaShape::kM0,
+            Problem::BlockFmhaShape::kSubQKHeaddim>();
     }
 
     template <typename Problem>
@@ -105,7 +74,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
 
         constexpr auto warp_gemm = []() {
             constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
-            static_assert(WarpGemmM == 16 || WarpGemmM == 32);
+            static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
 
             if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
                          std::is_same_v<typename Problem::KDataType, half_t> &&
@@ -113,8 +82,10 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
             {
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
-                else // WarpGemmM == 16
+                else if constexpr(WarpGemmM == 16)
                     return WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution{};
+                else // WarpGemmM == 4
+                    return WarpGemmMfmaF16F16F32M4N64K16{};
             }
             else if constexpr(std::is_same_v<typename Problem::QDataType, bf16_t> &&
                               std::is_same_v<typename Problem::KDataType, bf16_t> &&
@@ -122,8 +93,10 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
             {
                 if constexpr(WarpGemmM == 32)
                     return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
-                else // WarpGemmM == 16
+                else if constexpr(WarpGemmM == 16)
                     return WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution{};
+                else // WarpGemmM == 4
+                    return WarpGemmMfmaBf16Bf16F32M4N64K16{};
             }
             else if constexpr(std::is_same_v<typename Problem::QDataType, fp8_t> &&
                               std::is_same_v<typename Problem::KDataType, fp8_t> &&
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index bb33b5f02..5ce80c2d1 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -43,8 +43,6 @@ struct TileFmhaShape
 
     static constexpr index_t NumWarps = max(NumGemm0Warps, NumGemm1Warps);
 
-    static_assert(std::is_same_v<Gemm0WarpTile, Gemm1WarpTile>);
-
     static constexpr index_t kM0 = BlockTile::at(number<0>{}); // tile size along q seqlen
     static constexpr index_t kN0 = BlockTile::at(number<1>{}); // tile size along k seqlen
     static constexpr index_t kK0 = BlockTile::at(number<2>{}); // tile size along qk gemm unroll
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
index ff23f6355..b99466b1e 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
@@ -65,14 +65,6 @@ struct BlockGemmARegBSmemCRegOneWarpV1
 
         const index_t iNWarp = 0;
 
-        constexpr auto a_block_outer_dstr_encoding =
-            tile_distribution_encoding<sequence<NWarp>,
-                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
-                                       tuple<sequence<1, 0>>,
-                                       tuple<sequence<1, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 0>>{};
-
         constexpr auto c_block_outer_dstr_encoding =
             tile_distribution_encoding<sequence<>,
                                        tuple<sequence<MIterPerWarp>, sequence<NIterPerWarp>>,
@@ -81,19 +73,14 @@ struct BlockGemmARegBSmemCRegOneWarpV1
                                        sequence<1, 2>,
                                        sequence<0, 0>>{};
 
-        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
-
         constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
             c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
 
-        constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode);
-
         // constrcut from A-block-tensor from A-Block-tensor-tmp
         // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent
         // distribution
-        auto a_block_tensor =
-            make_static_distributed_tensor<typename ABlockTensorTmp::DataType>(a_block_dstr);
+        auto a_block_tensor = make_static_distributed_tensor<typename ABlockTensorTmp::DataType>(
+            MakeABlockTileDistribution());
 
         a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer();
 
@@ -187,6 +174,33 @@ struct BlockGemmARegBSmemCRegOneWarpV1
         });
     }
 
+    template <index_t MPerBlock = BlockGemmShape::kM, index_t KPerBlock = BlockGemmShape::kK>
+    CK_TILE_DEVICE static constexpr auto MakeABlockTileDistribution()
+    {
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr auto a_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
+
+        return make_static_tile_distribution(a_block_dstr_encode);
+    }
+
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
index 173ef0a02..0181c0eec 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
@@ -59,14 +59,6 @@ struct BlockGemmARegBSmemCRegV2
 
         const index_t iNWarp = get_warp_id() % NWarp;
 
-        constexpr auto a_block_outer_dstr_encoding =
-            tile_distribution_encoding<sequence<NWarp>,
-                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
-                                       tuple<sequence<1, 0>>,
-                                       tuple<sequence<1, 0>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 0>>{};
-
         constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
             sequence<>,
             tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
@@ -75,19 +67,14 @@ struct BlockGemmARegBSmemCRegV2
             sequence<1, 2>,
             sequence<0, 0>>{};
 
-        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
-
         constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
             c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
 
-        constexpr auto a_block_dstr = make_static_tile_distribution(a_block_dstr_encode);
-
         // constrcut from A-block-tensor from A-Block-tensor-tmp
         // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent
         // distribution
-        auto a_block_tensor =
-            make_static_distributed_tensor<typename ABlockTensorTmp::DataType>(a_block_dstr);
+        auto a_block_tensor = make_static_distributed_tensor<typename ABlockTensorTmp::DataType>(
+            MakeABlockTileDistribution());
 
         a_block_tensor.get_thread_buffer() = a_block_tensor_tmp.get_thread_buffer();
 
@@ -182,6 +169,33 @@ struct BlockGemmARegBSmemCRegV2
         });
     }
 
+    template <index_t MPerBlock = BlockGemmShape::kM, index_t KPerBlock = BlockGemmShape::kK>
+    CK_TILE_DEVICE static constexpr auto MakeABlockTileDistribution()
+    {
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr auto a_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
+
+        return make_static_tile_distribution(a_block_dstr_encode);
+    }
+
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
         constexpr index_t MPerBlock = BlockGemmShape::kM;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index 89ea82c5b..1fd12973f 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -56,6 +56,14 @@ using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
+using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M4N64K4<WGAttrCtlEnum::Default_>,
+    4>>;
+
+using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplF16F16F32M64N4K4<WGAttrCtlEnum::Default_>,
+    4>>;
+
 // bf16
 
 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
@@ -104,6 +112,14 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
+using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4<WGAttrCtlEnum::Default_>,
+    4>>;
+
+using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4<WGAttrCtlEnum::Default_>,
+    4>>;
+
 // fp8
 
 using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index a9e466a79..e7d4c3796 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -28,6 +28,9 @@ struct WarpGemmAtrributeMfma
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeMfmaImpl is not supported");
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -94,30 +97,130 @@ struct WarpGemmAtrributeMfmaIterateK
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
-    using AWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
-        tuple<sequence<2, 1>>,
-        tuple<sequence<0, 0>>,
-        sequence<2>,
-        sequence<1>>;
+    static_assert(Impl::kAMBlock == 1 || Impl::kBNBlock == 1,
+                  "Multi-block on both M & N directions is not supported");
 
-    using BWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
-        tuple<sequence<2, 1>>,
-        tuple<sequence<0, 0>>,
-        sequence<2>,
-        sequence<1>>;
+    CK_TILE_DEVICE static constexpr auto get_awarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            // each M blocks share the same data
+            return tile_distribution_encoding<
+                sequence<Impl::kBNBlock>,
+                tuple<sequence<Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<0, 2, 1>>,
+                tuple<sequence<0, 0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            // single block to multi-block thread mapping
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kAMBlock, Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<1, 2, 1>>,
+                tuple<sequence<0, 0, 1>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+    }
 
-    using CWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
-              sequence<Impl::kCNLane>>,
-        tuple<sequence<1, 2>>,
-        tuple<sequence<1, 0>>,
-        sequence<1, 1>,
-        sequence<0, 2>>;
+    CK_TILE_DEVICE static constexpr auto get_bwarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            // single block to multi-block thread mapping
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kBNBlock, Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<1, 2, 1>>,
+                tuple<sequence<0, 0, 1>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            // each N blocks share the same data
+            return tile_distribution_encoding<
+                sequence<Impl::kAMBlock>,
+                tuple<sequence<Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<0, 2, 1>>,
+                tuple<sequence<0, 0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+    }
+
+    CK_TILE_DEVICE static constexpr auto get_cwarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
+                      sequence<Impl::kCNLane>>,
+                tuple<sequence<1, 2>>,
+                tuple<sequence<1, 0>>,
+                sequence<1, 1>,
+                sequence<0, 2>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
+                      sequence<Impl::kBNBlock * Impl::kCNLane>>,
+                tuple<sequence<1, 2>>,
+                tuple<sequence<1, 0>>,
+                sequence<1, 1>,
+                sequence<0, 2>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<
+                    sequence<Impl::kCM0PerLane, Impl::kAMBlock * Impl::kCMLane, Impl::kCM1PerLane>,
+                    sequence<Impl::kCNLane>>,
+                tuple<sequence<1, 2>>,
+                tuple<sequence<1, 0>>,
+                sequence<1, 1>,
+                sequence<0, 2>>{};
+        }
+    }
+
+    using AWarpDstrEncoding = decltype(get_awarp_dstr_encoding());
+
+    using BWarpDstrEncoding = decltype(get_bwarp_dstr_encoding());
+
+    using CWarpDstrEncoding = decltype(get_cwarp_dstr_encoding());
 
     // c_vec += a_vec * b_vec
     template <bool post_nop_ = false>
@@ -206,6 +309,9 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeMfmaImpl is not supported");
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -270,6 +376,9 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeMfmaImpl is not supported");
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
@@ -341,30 +450,130 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
-    using AWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
-        tuple<sequence<2, 1>>,
-        tuple<sequence<0, 0>>,
-        sequence<2>,
-        sequence<1>>;
+    static_assert(Impl::kAMBlock == 1 || Impl::kBNBlock == 1,
+                  "Multi-block on both M & N directions is not supported");
 
-    using BWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
-        tuple<sequence<2, 1>>,
-        tuple<sequence<0, 0>>,
-        sequence<2>,
-        sequence<1>>;
+    CK_TILE_DEVICE static constexpr auto get_awarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            // single block to multi-block thread mapping
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kBNBlock, Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<1, 2, 1>>,
+                tuple<sequence<0, 0, 1>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            // each N blocks share the same data
+            return tile_distribution_encoding<
+                sequence<Impl::kAMBlock>,
+                tuple<sequence<Impl::kBNLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<0, 2, 1>>,
+                tuple<sequence<0, 0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+    }
 
-    using CWarpDstrEncoding = tile_distribution_encoding<
-        sequence<>,
-        tuple<sequence<Impl::kCNLane>,
-              sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
-        tuple<sequence<2, 1>>,
-        tuple<sequence<1, 0>>,
-        sequence<2, 2>,
-        sequence<0, 2>>;
+    CK_TILE_DEVICE static constexpr auto get_bwarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            // each M blocks share the same data
+            return tile_distribution_encoding<
+                sequence<Impl::kBNBlock>,
+                tuple<sequence<Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<0, 2, 1>>,
+                tuple<sequence<0, 0, 0>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            // single block to multi-block thread mapping
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kAMBlock, Impl::kAMLane>,
+                      sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+                tuple<sequence<1, 2, 1>>,
+                tuple<sequence<0, 0, 1>>,
+                sequence<2>,
+                sequence<1>>{};
+        }
+    }
+
+    CK_TILE_DEVICE static constexpr auto get_cwarp_dstr_encoding()
+    {
+        if constexpr(Impl::kAMBlock == 1 && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kCNLane>,
+                      sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<1, 0>>,
+                sequence<2, 2>,
+                sequence<0, 2>>{};
+        }
+        else if constexpr(Impl::kAMBlock == 1 && 1 < Impl::kBNBlock)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<Impl::kBNBlock * Impl::kCNLane>,
+                      sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<1, 0>>,
+                sequence<2, 2>,
+                sequence<0, 2>>{};
+        }
+        else if constexpr(1 < Impl::kAMBlock && Impl::kBNBlock == 1)
+        {
+            return tile_distribution_encoding<
+                sequence<>,
+                tuple<
+                    sequence<Impl::kCNLane>,
+                    sequence<Impl::kCM0PerLane, Impl::kAMBlock * Impl::kCMLane, Impl::kCM1PerLane>>,
+                tuple<sequence<2, 1>>,
+                tuple<sequence<1, 0>>,
+                sequence<2, 2>,
+                sequence<0, 2>>{};
+        }
+    }
+
+    using AWarpDstrEncoding = decltype(get_awarp_dstr_encoding());
+
+    using BWarpDstrEncoding = decltype(get_bwarp_dstr_encoding());
+
+    using CWarpDstrEncoding = decltype(get_cwarp_dstr_encoding());
 
     template <bool post_nop_ = false>
     // c_vec += a_vec * b_vec
@@ -457,6 +666,9 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeMfmaImpl is not supported");
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
@@ -597,6 +809,9 @@ struct WarpGemmAtrributeMfmaIterateK_SwizzleA
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return kKIter; }
 
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeMfmaImpl is not supported");
+
     using AWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index 0aba1f535..fa24711de 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -78,6 +78,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
     static constexpr index_t kN = 32;
     static constexpr index_t kK = 8;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 32;
     static constexpr index_t kBNLane     = 32;
     static constexpr index_t kABKLane    = 2;
@@ -138,6 +141,9 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
     static constexpr index_t kN = 16;
     static constexpr index_t kK = 16;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 16;
     static constexpr index_t kBNLane     = 16;
     static constexpr index_t kABKLane    = 4;
@@ -182,6 +188,134 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF16F16F32M4N64K4
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 4>;
+    using BVecType = ext_vector_t<fp16_t, 4>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 4;
+    static constexpr index_t kN = 64;
+    static constexpr index_t kK = 4;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 16;
+
+    // we only write down single block (4 threads) thread mapping here
+    static constexpr index_t kAMLane     = 4;
+    static constexpr index_t kBNLane     = 4;
+    static constexpr index_t kABKLane    = 1;
+    static constexpr index_t kABKPerLane = 4;
+
+    static constexpr index_t kCMLane     = 1;
+    static constexpr index_t kCNLane     = 4;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4f16", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ignore = c_vec;
+            ignore = a_vec;
+            ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ignore = a_vec;
+        ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF16F16F32M64N4K4
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 4>;
+    using BVecType = ext_vector_t<fp16_t, 4>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 64;
+    static constexpr index_t kN = 4;
+    static constexpr index_t kK = 4;
+
+    static constexpr index_t kAMBlock = 16;
+    static constexpr index_t kBNBlock = 1;
+
+    // we only write down single block (4 threads) thread mapping here
+    static constexpr index_t kAMLane     = 4;
+    static constexpr index_t kBNLane     = 4;
+    static constexpr index_t kABKLane    = 1;
+    static constexpr index_t kABKPerLane = 4;
+
+    static constexpr index_t kCMLane     = 1;
+    static constexpr index_t kCNLane     = 4;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4f16", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ignore = c_vec;
+            ignore = a_vec;
+            ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_4x4x4f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ignore = a_vec;
+        ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 // Bf16
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
@@ -199,6 +333,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
     static constexpr index_t kN = 32;
     static constexpr index_t kK = 8;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 32;
     static constexpr index_t kBNLane     = 32;
     static constexpr index_t kABKLane    = 2;
@@ -285,6 +422,9 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
     static constexpr index_t kN = 16;
     static constexpr index_t kK = 16;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 16;
     static constexpr index_t kBNLane     = 16;
     static constexpr index_t kABKLane    = 4;
@@ -354,6 +494,134 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<bf16_t, 4>;
+    using BVecType = ext_vector_t<bf16_t, 4>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 4;
+    static constexpr index_t kN = 64;
+    static constexpr index_t kK = 4;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 16;
+
+    // we only write down single block (4 threads) thread mapping here
+    static constexpr index_t kAMLane     = 4;
+    static constexpr index_t kBNLane     = 4;
+    static constexpr index_t kABKLane    = 1;
+    static constexpr index_t kABKPerLane = 4;
+
+    static constexpr index_t kCMLane     = 1;
+    static constexpr index_t kCNLane     = 4;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ignore = c_vec;
+            ignore = a_vec;
+            ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ignore = a_vec;
+        ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<bf16_t, 4>;
+    using BVecType = ext_vector_t<bf16_t, 4>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 64;
+    static constexpr index_t kN = 4;
+    static constexpr index_t kK = 4;
+
+    static constexpr index_t kAMBlock = 16;
+    static constexpr index_t kBNBlock = 1;
+
+    // we only write down single block (4 threads) thread mapping here
+    static constexpr index_t kAMLane     = 4;
+    static constexpr index_t kBNLane     = 4;
+    static constexpr index_t kABKLane    = 1;
+    static constexpr index_t kABKPerLane = 4;
+
+    static constexpr index_t kCMLane     = 1;
+    static constexpr index_t kCNLane     = 4;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_4x4x4bf16_1k", Ctrl)
+        else
+        {
+#if defined(__gfx9__)
+            c_vec = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ignore = c_vec;
+            ignore = a_vec;
+            ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx9__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ignore = a_vec;
+        ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 // FP8
 template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
@@ -371,6 +639,9 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
     static constexpr index_t kN = 32;
     static constexpr index_t kK = 16;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 32;
     static constexpr index_t kBNLane     = 32;
     static constexpr index_t kABKLane    = 2;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 99cd5d787..9c319b5e5 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -29,6 +29,8 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 4, 64, 16, false> { using Type = WarpGemmMfmaF16F16F32M4N64K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 64, 4, 16, false> { using Type = WarpGemmMfmaF16F16F32M64N4K16; };
 
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
@@ -42,6 +44,8 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 4, 64, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M4N64K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 64, 4, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M64N4K16; };
 
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
-- 
GitLab


From 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Fri, 20 Dec 2024 16:40:45 +0800
Subject: [PATCH 138/153] hot-fix (#1768)

---
 .../ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp    | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index fa24711de..21a865e79 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -839,6 +839,9 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
     static constexpr index_t kN = 32;
     static constexpr index_t kK = 16;
 
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
     static constexpr index_t kAMLane     = 32;
     static constexpr index_t kBNLane     = 32;
     static constexpr index_t kABKLane    = 2;
-- 
GitLab


From 07339c738396ebeae57374771ded4dcf11bddf1e Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 20 Dec 2024 07:52:24 -0800
Subject: [PATCH 139/153] fix typo for CK_USE_OCP_FP8 (#1769)

---
 include/ck/config.h.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/config.h.in b/include/ck/config.h.in
index 55a498073..2c37300e9 100644
--- a/include/ck/config.h.in
+++ b/include/ck/config.h.in
@@ -115,8 +115,8 @@
 #cmakedefine CK_USE_GFX94 @CK_USE_GFX94@
 #endif
 
-#ifndef DCK_USE_OCP_FP8
-#cmakedefine DCK_USE_OCP_FP8 @DCK_USE_OCP_FP8@
+#ifndef CK_USE_OCP_FP8
+#cmakedefine CK_USE_OCP_FP8 @CK_USE_OCP_FP8@
 #endif
 
 #ifndef CK_USE_FNUZ_FP8
-- 
GitLab


From 3d15f364b367b24ac709ea5687fa2d7d39f07cf9 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Mon, 23 Dec 2024 10:59:02 +0800
Subject: [PATCH 140/153] [CK_TILE] optimize moe-sorting kernel (#1771)

* opt moe sorting

* remove commented code
---
 .../13_moe_sorting/moe_sorting_api.cpp        |  53 ++--
 .../13_moe_sorting/script/smoke_test.sh       |   3 +-
 .../instances/fused_moesorting_api.cpp        |  53 ++--
 .../fused_moe/kernel/moe_sorting_kernel.hpp   | 247 +++++++++++++++---
 .../pipeline/moe_sorting_problem.hpp          |  13 +-
 5 files changed, 289 insertions(+), 80 deletions(-)

diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
index 25e99c530..723fb3f69 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -3,18 +3,42 @@
 
 #include "moe_sorting_api.hpp"
 
-#define MOE_SORTING_DISPATCH(unroll_num_)                                                   \
-    constexpr ck_tile::index_t unroll_num = unroll_num_;                                    \
-    using ms_problem     = ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num>; \
-    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                           \
-    auto kargs           = kernel::MakeKargs(a);                                            \
-    const dim3 grids     = kernel::GridSize(a);                                             \
-    const dim3 blocks    = kernel::BlockSize(a);                                            \
-    const auto lds_bytes = kernel::GetSmemSize(a);                                          \
-    float ave_time       = ck_tile::launch_kernel(                                          \
-        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));          \
+#define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
+    constexpr ck_tile::index_t unroll_num  = unroll_num_;                             \
+    constexpr ck_tile::index_t expert_tile = expert_tile_;                            \
+    using ms_problem =                                                                \
+        ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num, expert_tile>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                     \
+    auto kargs           = kernel::MakeKargs(a);                                      \
+    const dim3 grids     = kernel::GridSize(a);                                       \
+    const dim3 blocks    = kernel::BlockSize(a);                                      \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                    \
+    float ave_time       = ck_tile::launch_kernel(                                    \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));    \
     return ave_time;
 
+#define MOE_SORTING_DISPATCH(unroll_num_)           \
+    if(a.num_experts <= 8)                          \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 8)  \
+    }                                               \
+    else if(a.num_experts <= 16)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 16) \
+    }                                               \
+    else if(a.num_experts <= 32)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 32) \
+    }                                               \
+    else if(a.num_experts <= 64)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 64) \
+    }                                               \
+    else                                            \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0)  \
+    }
+
 float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
 {
     if(t.weight_type == "fp32" && t.index_type == "int32")
@@ -49,21 +73,12 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         case(6): {
             MOE_SORTING_DISPATCH(6);
         }
-        case(7): {
-            MOE_SORTING_DISPATCH(7);
-        }
         case(8): {
             MOE_SORTING_DISPATCH(8);
         }
-        case(9): {
-            MOE_SORTING_DISPATCH(9);
-        }
         case(10): {
             MOE_SORTING_DISPATCH(10);
         }
-        case(11): {
-            MOE_SORTING_DISPATCH(11);
-        }
         default: {
             MOE_SORTING_DISPATCH(4);
         }
diff --git a/example/ck_tile/13_moe_sorting/script/smoke_test.sh b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
index 1fc5eafcb..3ff8a7332 100644
--- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh
+++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
@@ -16,4 +16,5 @@ $EXE -t=127 -e=99 -k=19
 $EXE -t=71 -e=11 -k=11
 $EXE -t=1 -e=1 -k=1
 $EXE -t=99 -e=2 -k=1
-$EXE -t=333 -e=99 -k=13
\ No newline at end of file
+$EXE -t=333 -e=99 -k=13
+$EXE -t=128 -e=32 -k=5 -moe_buf_size=262144
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
index 75aaf86b7..7ca24c5c9 100644
--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -3,18 +3,42 @@
 
 #include "fused_moesorting.hpp"
 
-#define MOE_SORTING_DISPATCH(unroll_num_)                                                   \
-    constexpr ck_tile::index_t unroll_num = unroll_num_;                                    \
-    using ms_problem     = ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num>; \
-    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                           \
-    auto kargs           = kernel::MakeKargs(a);                                            \
-    const dim3 grids     = kernel::GridSize(a);                                             \
-    const dim3 blocks    = kernel::BlockSize(a);                                            \
-    const auto lds_bytes = kernel::GetSmemSize(a);                                          \
-    float ave_time       = ck_tile::launch_kernel(                                          \
-        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));          \
+#define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
+    constexpr ck_tile::index_t unroll_num  = unroll_num_;                             \
+    constexpr ck_tile::index_t expert_tile = expert_tile_;                            \
+    using ms_problem =                                                                \
+        ck_tile::MoeSortingProblem<index_t, ms_weight_type, unroll_num, expert_tile>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                     \
+    auto kargs           = kernel::MakeKargs(a);                                      \
+    const dim3 grids     = kernel::GridSize(a);                                       \
+    const dim3 blocks    = kernel::BlockSize(a);                                      \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                    \
+    float ave_time       = ck_tile::launch_kernel(                                    \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));    \
     return ave_time;
 
+#define MOE_SORTING_DISPATCH(unroll_num_)           \
+    if(a.num_experts <= 8)                          \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 8)  \
+    }                                               \
+    else if(a.num_experts <= 16)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 16) \
+    }                                               \
+    else if(a.num_experts <= 32)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 32) \
+    }                                               \
+    else if(a.num_experts <= 64)                    \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 64) \
+    }                                               \
+    else                                            \
+    {                                               \
+        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0)  \
+    }
+
 float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s)
 {
     if(t.weight_type == "fp32" && t.index_type == "int32")
@@ -49,21 +73,12 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
         case(6): {
             MOE_SORTING_DISPATCH(6);
         }
-        case(7): {
-            MOE_SORTING_DISPATCH(7);
-        }
         case(8): {
             MOE_SORTING_DISPATCH(8);
         }
-        case(9): {
-            MOE_SORTING_DISPATCH(9);
-        }
         case(10): {
             MOE_SORTING_DISPATCH(10);
         }
-        case(11): {
-            MOE_SORTING_DISPATCH(11);
-        }
         default: {
             MOE_SORTING_DISPATCH(4);
         }
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index d9e28ceb5..30e68996b 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -130,7 +130,8 @@ struct MoeSortingKernel
     CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h)
     {
         const auto blocks = BlockSize(h);
-        return ((blocks.x + 1) * h.num_experts + (h.num_experts + 1)) * sizeof(index_t);
+        // usually num_experts is power of 2, we pad 1 dword here for the row-size
+        return ((blocks.x + 1) * (h.num_experts + 1) + (h.num_experts + 1)) * sizeof(index_t);
     }
 
     CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
@@ -154,6 +155,75 @@ struct MoeSortingKernel
         return k;
     }
 
+        // [a, b, c, d....] -> [a, a+b, a+b+c, a+b+c+d, ....]
+    template <typename data_t, int wave_size>
+    __device__ inline void wave_cumsum(data_t& thread_data) const
+    {
+        // wave_size must be power of 2
+        constexpr int row_mask    = 0xf;
+        constexpr int bank_mask   = 0xf;
+        constexpr bool bound_ctrl = true;   // ! out-of-bound is zero !
+        auto reduce_op = [&](auto x_, auto y_) { return x_ + y_; };
+
+        if constexpr(wave_size > 1)
+        {
+            thread_data = reduce_op(
+                thread_data,
+                __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                            0x111,
+                                                            row_mask,
+                                                            bank_mask,
+                                                            bound_ctrl))); // row_shr:1
+        }
+
+        if constexpr(wave_size > 2)
+        {
+            thread_data = reduce_op(
+                thread_data,
+                __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                            0x112,
+                                                            row_mask,
+                                                            bank_mask,
+                                                            bound_ctrl))); // row_shr:2
+        }
+        if constexpr(wave_size > 4)
+        {
+            thread_data =
+                reduce_op(thread_data,
+                        __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                                        0x114,
+                                                                        row_mask,
+                                                                        bank_mask,
+                                                                        bound_ctrl))); // row_shr:4
+        }
+        if constexpr(wave_size > 8)
+        {
+            thread_data =
+                reduce_op(thread_data,
+                        __builtin_bit_cast(data_t, __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                                        0x118,
+                                                                        row_mask,
+                                                                        bank_mask,
+                                                                        bound_ctrl))); // row_shr:8
+        }
+
+        if constexpr(wave_size > 16)
+        {
+            // now row-0, row-0+row-1, row-1+row-2, row-2+row-3
+            int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 1) << 2, __builtin_bit_cast(int, thread_data));
+            v_remote_tmp = __lane_id() >= 16 ? v_remote_tmp : 0;
+            thread_data = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+        }
+
+        if constexpr(wave_size > 32)
+        {
+            // lane-id 48...63->31
+            int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 17) << 2, __builtin_bit_cast(int, thread_data));
+            v_remote_tmp = __lane_id() >= 32 ? v_remote_tmp : 0;
+            thread_data = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+        }
+    }
+
     CK_TILE_DEVICE index_t calc_index(index_t total_col, index_t row, index_t col) const
     {
         return row * total_col + col;
@@ -187,48 +257,124 @@ struct MoeSortingKernel
         index_t* shared_mem = reinterpret_cast<index_t*>(smem);
 
         index_t* tokens_cnts = shared_mem; // 2d: (blockDim.x + 1, num_experts)
-        index_t* cumsum      = shared_mem + (blockDim.x + 1) * num_experts; // 1: (num_experts + 1)
+        index_t* cumsum      = shared_mem + (blockDim.x + 1) * (num_experts+1); // 1: (num_experts + 1)
+
         for(int i = 0; i < num_experts; ++i)
         {
-            tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0;
+            tokens_cnts[calc_index(num_experts+1, tid + 1, i)] = 0;
         }
+
 #pragma unroll Problem_::InternalLoadUnroll
         for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
         {
-            ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])];
+            ++tokens_cnts[calc_index(num_experts+1, tid + 1, topk_id[i])];
         }
         __syncthreads();
 
+#if 1
         if(tid < num_experts)
         {
-            tokens_cnts[calc_index(num_experts, 0, tid)] = 0;
-            for(int i = 1; i <= static_cast<index_t>(blockDim.x); ++i)
+            tokens_cnts[calc_index(num_experts+1, 0, tid)] = 0;
+            index_t local_c[8];
+            index_t prev_c = 0;
+            // TODO: manually unroll. pragma unroll does not work well when we have dependency
+            for(int i = 1; i <= static_cast<index_t>(blockDim.x); i+= 8)
             {
-                tokens_cnts[calc_index(num_experts, i, tid)] +=
-                    tokens_cnts[calc_index(num_experts, i - 1, tid)];
+                local_c[0] = tokens_cnts[calc_index(num_experts+1, i + 0, tid)];
+                local_c[1] = tokens_cnts[calc_index(num_experts+1, i + 1, tid)];
+                local_c[2] = tokens_cnts[calc_index(num_experts+1, i + 2, tid)];
+                local_c[3] = tokens_cnts[calc_index(num_experts+1, i + 3, tid)];
+                local_c[4] = tokens_cnts[calc_index(num_experts+1, i + 4, tid)];
+                local_c[5] = tokens_cnts[calc_index(num_experts+1, i + 5, tid)];
+                local_c[6] = tokens_cnts[calc_index(num_experts+1, i + 6, tid)];
+                local_c[7] = tokens_cnts[calc_index(num_experts+1, i + 7, tid)];
+
+                local_c[0] += prev_c;
+                local_c[1] += local_c[0];
+                local_c[2] += local_c[1];
+                local_c[3] += local_c[2];
+                local_c[4] += local_c[3];
+                local_c[5] += local_c[4];
+                local_c[6] += local_c[5];
+                local_c[7] += local_c[6];
+                prev_c = local_c[7];
+
+                tokens_cnts[calc_index(num_experts+1, i + 0, tid)] = local_c[0];
+                tokens_cnts[calc_index(num_experts+1, i + 1, tid)] = local_c[1];
+                tokens_cnts[calc_index(num_experts+1, i + 2, tid)] = local_c[2];
+                tokens_cnts[calc_index(num_experts+1, i + 3, tid)] = local_c[3];
+                tokens_cnts[calc_index(num_experts+1, i + 4, tid)] = local_c[4];
+                tokens_cnts[calc_index(num_experts+1, i + 5, tid)] = local_c[5];
+                tokens_cnts[calc_index(num_experts+1, i + 6, tid)] = local_c[6];
+                tokens_cnts[calc_index(num_experts+1, i + 7, tid)] = local_c[7];
             }
         }
-
-        // __syncthreads();
-        if(tid == 0)
+#else
+        // TODO: below code still working, but slow in expert=32/topk=5 case. Put here for future heuristic
         {
-            cumsum[0] = 0;
-            for(int i = 1; i <= num_experts; ++i)
+            if(tid < num_experts)
+                tokens_cnts[calc_index(num_experts+1, 0, tid)] = 0;
+            for(int i = 0; i < num_experts; i+=8) {
+                index_t local_c[8];
+                #pragma unroll
+                for(int j = 0; j < 8; j++) {
+                    local_c[j] = tokens_cnts[calc_index(num_experts+1, tid+1, i+j)];
+                }
+
+                #pragma unroll
+                for(int j = 0; j < 8; j++) {
+                    wave_cumsum<int, 64>(local_c[j]);
+                }
+
+                #pragma unroll
+                for(int j = 0; j < 8; j++) {
+                    tokens_cnts[calc_index(num_experts+1, tid+1, i+j)] = local_c[j];
+                }
+            }
+        }
+#endif
+
+        __syncthreads();
+        if constexpr (Problem::ExpertTile == 0) {
+            if(tid == 0)
             {
-                auto current_units = [&]() {
-                    index_t x_ = tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)] +
-                                 unit_size_mdiv.divisor - 1;
-                    index_t y_ = unit_size_mdiv.div(x_);
-                    return max(y_, 1) * unit_size_mdiv.divisor;
-                }();
-                cumsum[i] = cumsum[i - 1] + current_units;
+                cumsum[0] = 0;
+                for(int i = 1; i <= num_experts; ++i)
+                {
+                    auto current_units = [&]() {
+                        index_t x_ = tokens_cnts[calc_index(num_experts+1, blockDim.x, i - 1)] +
+                                    unit_size_mdiv.divisor - 1;
+                        index_t y_ = unit_size_mdiv.div(x_);
+                        return max(y_, 1) * unit_size_mdiv.divisor;
+                    }();
+                    cumsum[i] = cumsum[i - 1] + current_units;
+                }
+                *p_total_tokens_post_pad = cumsum[num_experts];
+            }
+        } else {
+            // TODO: we have out-of-bound read here. But result is still OK (will ignore tid >= expert)
+            // for simplicity, not check experts here.
+            int local_cnt = tokens_cnts[calc_index(num_experts+1, blockDim.x, tid)];
+            int blocks_pers_expert = unit_size_mdiv.div(local_cnt + unit_size_mdiv.divisor - 1);
+            int padded_tokens_per_expert = max(blocks_pers_expert, 1) * unit_size_mdiv.divisor;
+            int local_cumsum = padded_tokens_per_expert;
+            wave_cumsum<int, 64>(local_cumsum);
+
+            if(tid == (num_experts - 1)) {
+                cumsum[0] = 0;
+                *p_total_tokens_post_pad = local_cumsum;
+            }
+            if(tid < num_experts) {
+                cumsum[tid + 1] = local_cumsum;
             }
-            *p_total_tokens_post_pad = cumsum[num_experts];
         }
+
         __syncthreads();
         if(tid < num_experts)
         {
-            for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size_mdiv.divisor)
+            int e_start = cumsum[tid];
+            int e_end = cumsum[tid + 1];
+            for(int i = e_start; i < e_end; i += unit_size_mdiv.divisor)
             {
                 p_sorted_expert_ids[unit_size_mdiv.div(i)] = tid;
             }
@@ -238,8 +384,8 @@ struct MoeSortingKernel
         for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
         {
             index_t expert_id = topk_id[i];
-            index_t rank_post_pad =
-                tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
+            index_t local_cnt = tokens_cnts[calc_index(num_experts+1, tid, expert_id)];
+            index_t rank_post_pad = local_cnt + cumsum[expert_id];
 #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
             uint32_t curr_token_id, curr_topk_id;
             topk_mdiv.divmod(i, curr_token_id, curr_topk_id);
@@ -247,27 +393,54 @@ struct MoeSortingKernel
 #else
             p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i);
 #endif
-            p_sorted_weights[rank_post_pad] = weights[i];
-            ++tokens_cnts[calc_index(num_experts, tid, expert_id)];
+            p_sorted_weights[rank_post_pad] = weights[i];           
+            tokens_cnts[calc_index(num_experts+1, tid, expert_id)] = local_cnt+1;
         }
 
-        const index_t prefill_token = topk_mdiv.div(numel);
-        if(tid < num_experts)
-        {
-            index_t expert_offset =
-                cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
-            while(expert_offset < cumsum[tid + 1])
+        if constexpr (Problem::ExpertTile == 0) {
+            const index_t prefill_token = topk_mdiv.div(numel);
+            if(tid < num_experts)
             {
+                index_t expert_offset =
+                    cumsum[tid] + tokens_cnts[calc_index(num_experts+1, blockDim.x, tid)];
+                index_t expert_end = cumsum[tid + 1];
+                while(expert_offset < expert_end)
+                {
 #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                p_sorted_token_ids[expert_offset] =
-                    MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor);
+                    p_sorted_token_ids[expert_offset] =
+                        MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor);
 #else
-                p_sorted_token_ids[expert_offset] = prefill_token;
+                    p_sorted_token_ids[expert_offset] = prefill_token;
 #endif
-                p_sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
-                expert_offset++;
+                    p_sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
+                    expert_offset++;
+                }
             }
         }
+        else {
+            const index_t prefill_token = topk_mdiv.div(numel);
+            // TODO: only support expert-tile like 8, 16, 32
+            static constexpr index_t experts_per_wave = warpSize / Problem::ExpertTile;
+            {
+                index_t eid = tid / experts_per_wave;
+                index_t expert_offset =
+                    cumsum[eid] + tokens_cnts[calc_index(num_experts+1, blockDim.x, eid)] + tid % experts_per_wave;
+                index_t expert_end = cumsum[eid + 1];
+                if(eid < num_experts) {
+                    while(expert_offset < expert_end)
+                    {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+                        p_sorted_token_ids[expert_offset] =
+                            MOE_SORTING_MOCK_ID(prefill_token, topk_mdiv.divisor);
+#else
+                        p_sorted_token_ids[expert_offset] = prefill_token;
+#endif
+                        p_sorted_weights[expert_offset] = static_cast<WeightType>(0.0);
+                        expert_offset+=experts_per_wave;
+                    }
+                }
+            }    
+        }
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
index adde59e35..50005c440 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_problem.hpp
@@ -9,15 +9,20 @@
 
 namespace ck_tile {
 
-template <typename IndexType_, typename WeightType_, index_t InternalLoadUnroll_>
+template <typename IndexType_,
+          typename WeightType_,
+          index_t InternalLoadUnroll_,
+          index_t ExpertTile_ = 0>
 struct MoeSortingProblem
 {
     // TODO: this kernel only support warp per row
     using WeightType = remove_cvref_t<WeightType_>;
     using IndexType  = remove_cvref_t<IndexType_>;
 
-    static constexpr index_t WarpSize           = get_warp_size();
-    static constexpr index_t WarpsPerBlock      = 1;
-    static constexpr index_t InternalLoadUnroll = InternalLoadUnroll_;
+    static constexpr index_t WarpSize      = get_warp_size();
+    static constexpr index_t WarpsPerBlock = 1;
+    static constexpr index_t InternalLoadUnroll =
+        InternalLoadUnroll_;                           // TODO: need better design(like tile size)
+    static constexpr index_t ExpertTile = ExpertTile_; // TODO: only used in store out
 };
 } // namespace ck_tile
-- 
GitLab


From 4c2eff023a26821512a100171531dc8757ad0e8f Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Wed, 25 Dec 2024 23:57:28 +0800
Subject: [PATCH 141/153] Correct the dtype checking logics (#1775)

---
 example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index df5b9cecc..2f7edd547 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -261,7 +261,7 @@ FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F
                 static_assert({F_bn1} % 32 == 0);
 
                 if (t.has_lse) {{
-                    if constexpr (std::is_same_v<{F_dtype}, ck_tile::fp8_t>) {{
+                    if constexpr (std::is_same_v<{F_dtype}, FmhaFwdFp8>) {{
                         return -1;
                     }} else {{
                         using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, /*F_bn1=*/32, true, {F_squant}, {F_spad}, {F_dvpad}>;
@@ -614,7 +614,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
     }
     elif dtype == 'fp8' or dtype == 'bf8':
         return {
-            '64'  : FmhaFwdSplitKVCombineTileSize(32,   -1),
+            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
             '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
             '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
         }
-- 
GitLab


From af66494880fc6256e5e1ced779b6d80446726970 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Sat, 28 Dec 2024 14:40:17 +0100
Subject: [PATCH 142/153] [CK TILE] GEMM and Batched GEMM SplitK support
 (#1724)

* [CK TILE] Add split K support in GEMM

* Updates

* Fixes

* rebase

* fix

* Fix

* fixes

* support for batched gemm
---
 example/ck_tile/03_gemm/gemm_basic.hpp        |   6 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |   8 +-
 example/ck_tile/03_gemm/universal_gemm.cpp    |  20 +--
 .../ck_tile/16_batched_gemm/batched_gemm.cpp  |  13 +-
 .../ck_tile/16_batched_gemm/batched_gemm.hpp  |   3 +-
 .../run_batched_gemm_example.inc              |   4 +
 .../ops/epilogue/cshuffle_epilogue.hpp        |  31 +++-
 .../ops/epilogue/default_2d_epilogue.hpp      |  26 ++-
 .../ops/gemm/kernel/batched_gemm_kernel.hpp   |  32 +++-
 .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp   | 164 +++++++++++++-----
 .../gemm_pipeline_ag_bg_cr_comp_v3.hpp        |   2 +
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   |   2 +
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |   2 +
 ...ine_agmem_bgmem_creg_v1_default_policy.hpp |  14 +-
 .../gemm_pipeline_agmem_bgmem_creg_v2.hpp     |   2 +
 ...emm_universal_pipeline_ag_bg_cr_policy.hpp |   2 +
 .../batched_gemm/test_batched_gemm_util.hpp   |   3 +-
 test/ck_tile/gemm/test_gemm_pipeline_util.hpp |   4 +-
 18 files changed, 246 insertions(+), 92 deletions(-)

diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp
index 58cdaea7d..38c0a279d 100644
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -54,8 +54,7 @@ using CDataType   = Types::CDataType;
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("b", "1", "batch size")
-        .insert("m", "3840", "m dimension")
+    arg_parser.insert("m", "3840", "m dimension")
         .insert("n", "4096", "n dimension")
         .insert("k", "2048", "k dimension")
         .insert("a_layout", "R", "A tensor data layout - Row by default")
@@ -68,7 +67,8 @@ auto create_args(int argc, char* argv[])
         .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 68df389bf..56d0348bd 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -64,9 +64,9 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
     ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
 
-    ck_tile::index_t batch_size = arg_parser.get_int("b");
-    int n_warmup                = arg_parser.get_int("warmup");
-    int n_repeat                = arg_parser.get_int("repeat");
+    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
+    int n_warmup            = arg_parser.get_int("warmup");
+    int n_repeat            = arg_parser.get_int("repeat");
 
     using namespace ck_tile::literals;
 
@@ -133,7 +133,7 @@ int run_gemm_example_with_layouts(int argc,
                                            stride_A,
                                            stride_B,
                                            stride_C,
-                                           batch_size,
+                                           kbatch,
                                            n_warmup,
                                            n_repeat);
 
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index 6c87ca008..1a9e025a9 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -22,7 +22,7 @@
 #endif
 
 template <typename ALayout, typename BLayout, typename CLayout>
-float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
+float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
 {
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
     // Memory friendly for Interwave scheduler
@@ -78,7 +78,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
 #endif
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
 
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
+    const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
     const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
     const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
 
@@ -106,17 +108,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
                                                   has_hot_loop_v,
                                                   tail_number_v>>;
         using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKargs(args.p_a,
-                                       args.p_b,
-                                       args.p_c,
-                                       args.M,
-                                       args.N,
-                                       args.K,
-                                       args.stride_A,
-                                       args.stride_B,
-                                       args.stride_C);
-
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.kbatch);
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
         constexpr dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
index 9b4ed9a9e..b9c9eaa58 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -70,20 +70,25 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
 
     using CodegenGemmTraits =
         ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-
     using CodegenPipelineProblem = ck_tile::
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-
-    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
+    using CodegenGemmPipeline =
+        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
     // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
     using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
 
     auto kargs = Kernel::MakeKernelArgs(args);
 
-    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.batch_count);
+    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
     constexpr dim3 blocks = Kernel::BlockSize();
 
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
+
     if(s.log_level_ > 0)
     {
         std::cout << "Launching kernel with args:"
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
index f0c0c9efb..62f0058fd 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -49,7 +49,8 @@ auto create_args(int argc, char* argv[])
         .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
         .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
index 4e7218b5b..c14bb5668 100644
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -17,6 +17,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                           ck_tile::index_t batch_stride_B,
                           ck_tile::index_t batch_stride_C,
                           ck_tile::index_t batch_count,
+                          ck_tile::index_t kbatch,
                           int n_warmup,
                           int n_repeat)
 {
@@ -24,6 +25,7 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
     args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
     args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
+    args.k_batch        = kbatch;
     args.M              = M;
     args.N              = N;
     args.K              = K;
@@ -79,6 +81,7 @@ int run_batched_gemm_example_with_layouts(int argc,
     ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b");
     ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c");
     ck_tile::index_t batch_count    = arg_parser.get_int("batch_count");
+    ck_tile::index_t kbatch         = arg_parser.get_int("split_k");
 
     int n_warmup = arg_parser.get_int("warmup");
     int n_repeat = arg_parser.get_int("repeat");
@@ -159,6 +162,7 @@ int run_batched_gemm_example_with_layouts(int argc,
                                                    batch_stride_B,
                                                    batch_stride_C,
                                                    batch_count,
+                                                   kbatch,
                                                    n_warmup,
                                                    n_repeat);
 
diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
index 9625b137b..01105d2a8 100644
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -56,6 +56,13 @@ struct CShuffleEpilogue
     // No additional shared memory needed
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }
 
+    CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed()
+    {
+        // TODO: At now CShuffle doesn't allow to vector store after permute.
+        //       It should be fixed and this function should return true.
+        return false;
+    }
+
     template <typename OAccTile>
     CK_TILE_DEVICE void permute_tile_data(OAccTile& o_acc_tile)
     {
@@ -111,7 +118,9 @@ struct CShuffleEpilogue
         }
     }
 
-    template <typename ODramWindowTmp, typename OAccTile>
+    template <typename ODramWindowTmp,
+              typename OAccTile,
+              memory_operation_enum out_memory_data_op = memory_operation_enum::set>
     CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, OAccTile& o_acc_tile)
     {
         const auto& current_window_origin = o_dram_window_tmp.get_window_origin();
@@ -158,12 +167,26 @@ struct CShuffleEpilogue
         // Store the tile data to the permuted location
         if constexpr(kPadM || kPadN)
         {
-            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            if constexpr(out_memory_data_op == memory_operation_enum::set)
+            {
+                store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
+            else
+            {
+                update_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
             buffer_store_fence();
         }
         else
         {
-            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            if constexpr(out_memory_data_op == memory_operation_enum::set)
+            {
+                store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
+            else
+            {
+                update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
         }
     }
 };
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index 7c5d5a6f3..177573de3 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -35,21 +35,39 @@ struct Default2DEpilogue
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }
 
+    CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() { return false; }
+
     // TODO: this function assume store out vector size is the same as OAccTile last dimension size
     //       how do we fix this ?
-    template <typename ODramWindowTmp, typename OAccTile>
+    template <typename ODramWindowTmp,
+              typename OAccTile,
+              memory_operation_enum out_memory_data_op = memory_operation_enum::set>
     CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile)
     {
 
         // TODO: this is ugly
         if constexpr(UseRawStore && (kPadM || kPadN))
         {
-            store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            if constexpr(out_memory_data_op == memory_operation_enum::set)
+            {
+                store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
+            else
+            {
+                update_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
             buffer_store_fence();
         }
         else
         {
-            store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            if constexpr(out_memory_data_op == memory_operation_enum::set)
+            {
+                store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
+            else
+            {
+                update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
+            }
         }
     }
 };
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index 07a4cf8fb..eaf66237a 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -67,9 +67,10 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
 
     using KernelArgs = BatchedGemmKernelArgs;
 
-    __host__ static constexpr auto GridSize(index_t M, index_t N, index_t batch_count)
+    __host__ static constexpr auto
+    GridSize(index_t M, index_t N, index_t KBatch, index_t batch_count)
     {
-        return TilePartitioner::GridSize(M, N, batch_count);
+        return TilePartitioner::GridSize(M, N, KBatch * batch_count);
     }
 
     __host__ static constexpr auto BlockSize() { return dim3(Base::KernelBlockSize); }
@@ -85,7 +86,8 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
                                       hostArgs.K,
                                       hostArgs.stride_A,
                                       hostArgs.stride_B,
-                                      hostArgs.stride_C},
+                                      hostArgs.stride_C,
+                                      hostArgs.k_batch},
                                      hostArgs.batch_stride_A,
                                      hostArgs.batch_stride_B,
                                      hostArgs.batch_stride_C,
@@ -100,22 +102,38 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
     CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
     {
         const auto [i_m, i_n] = TilePartitioner{}();
-        const auto i_batch    = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const auto i_batch    = __builtin_amdgcn_readfirstlane(blockIdx.z / kargs.KBatch);
+        const auto i_k        = __builtin_amdgcn_readfirstlane(blockIdx.z - i_batch * kargs.KBatch);
+
+        const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, i_k);
 
         //  options
         const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
         const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
-        const ADataType* a_ptr    = static_cast<const ADataType*>(kargs.a_ptr) + batch_offset_A;
+        const ADataType* a_ptr    = static_cast<const ADataType*>(kargs.a_ptr) + batch_offset_A +
+                                 splitk_batch_offset.a_k_split_offset;
 
         const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
         const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
-        const BDataType* b_ptr    = static_cast<const BDataType*>(kargs.b_ptr) + batch_offset_B;
+        const BDataType* b_ptr    = static_cast<const BDataType*>(kargs.b_ptr) + batch_offset_B +
+                                 splitk_batch_offset.b_k_split_offset;
 
         const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C);
         const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C);
         CDataType* c_ptr          = static_cast<CDataType*>(kargs.c_ptr) + batch_offset_C;
 
-        this->RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n);
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        if(kargs.KBatch == 1)
+        {
+            this->RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+        }
+        else
+        {
+            this->template RunGemm<memory_operation_enum::atomic_add>(
+                a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+        }
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 925648a88..c81a64f7a 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -93,6 +93,7 @@ struct GemmKernel
         index_t stride_A;
         index_t stride_B;
         index_t stride_C;
+        index_t KBatch;
     };
 
     CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs)
@@ -105,28 +106,72 @@ struct GemmKernel
                               hostArgs.K,
                               hostArgs.stride_A,
                               hostArgs.stride_B,
-                              hostArgs.stride_C};
+                              hostArgs.stride_C,
+                              hostArgs.k_batch};
     }
-    // CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const void* a_ptr,
-    //                                                             const void* b_ptr,
-    //                                                             void* c_ptr,
-    //                                                             index_t M,
-    //                                                             index_t N,
-    //                                                             index_t K,
-    //                                                             index_t stride_A,
-    //                                                             index_t stride_B,
-    //                                                             index_t stride_C)
-    // {
-    //     return GemmKernelArgs{a_ptr, b_ptr, c_ptr, M, N, K, stride_A, stride_B, stride_C};
-    // }
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
+    struct SplitKBatchOffset
+    {
+        __device__ SplitKBatchOffset(const GemmKernelArgs& kargs,
+                                     const std::size_t k_id = blockIdx.z)
+        {
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = kargs.KBatch * K1;
+            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead * kargs.stride_A;
+            }
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead * kargs.stride_B;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead;
+            }
+
+            if(k_id < static_cast<uint32_t>(kargs.KBatch - 1))
+            {
+                splitted_k = KRead;
+            }
+            else
+            {
+                splitted_k = kargs.K - KRead * (kargs.KBatch - 1);
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t splitted_k;
+    };
+
     CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
     {
+        constexpr bool is_output_c_reg_transposed =
+            EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
+        if constexpr(!((GemmPipeline::VectorSizeC % 2 == 0 &&
+                        std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
+                        is_output_c_reg_transposed) ||
+                       !(std::is_same_v<CDataType, fp16_t> || std::is_same_v<CDataType, bf16_t>)))
+        {
+            if(kargs.KBatch != 1)
+            {
+                return false;
+            }
+        }
+
         if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
         {
             if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false)
@@ -198,17 +243,19 @@ struct GemmKernel
         return true;
     }
 
-    CK_TILE_DEVICE auto MakeGemmTensorViews(const ADataType* a_ptr,
-                                            const BDataType* b_ptr,
-                                            CDataType* c_ptr,
-                                            const GemmKernelArgs& kargs) const
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
+                                                   const BDataType* b_ptr,
+                                                   CDataType* c_ptr,
+                                                   const GemmKernelArgs& kargs,
+                                                   const SplitKBatchOffset& splitk_batch_offset)
     {
         const auto& a_tensor_view = [&]() {
             if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     a_ptr,
-                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
                     make_tuple(kargs.stride_A, 1),
                     number<GemmPipeline::VectorSizeA>{},
                     number<1>{});
@@ -217,7 +264,7 @@ struct GemmKernel
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     a_ptr,
-                    make_tuple(kargs.M, kargs.K),
+                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
                     make_tuple(1, kargs.stride_A),
                     number<1>{},
                     number<1>{});
@@ -229,7 +276,7 @@ struct GemmKernel
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     b_ptr,
-                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(kargs.N, splitk_batch_offset.splitted_k),
                     make_tuple(1, kargs.stride_B),
                     number<1>{},
                     number<1>{});
@@ -238,7 +285,7 @@ struct GemmKernel
             {
                 return make_naive_tensor_view<address_space_enum::global>(
                     b_ptr,
-                    make_tuple(kargs.N, kargs.K),
+                    make_tuple(kargs.N, splitk_batch_offset.splitted_k),
                     make_tuple(kargs.stride_B, 1),
                     number<GemmPipeline::VectorSizeB>{},
                     number<1>{});
@@ -248,7 +295,7 @@ struct GemmKernel
         const auto& c_tensor_view = [&]() {
             if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
             {
-                return make_naive_tensor_view<address_space_enum::global>(
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
                     c_ptr,
                     make_tuple(kargs.M, kargs.N),
                     make_tuple(kargs.stride_C, 1),
@@ -257,7 +304,7 @@ struct GemmKernel
             }
             else
             {
-                return make_naive_tensor_view<address_space_enum::global>(
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
                     c_ptr,
                     make_tuple(kargs.M, kargs.N),
                     make_tuple(1, kargs.stride_C),
@@ -270,7 +317,7 @@ struct GemmKernel
     }
 
     template <typename TensorView>
-    CK_TILE_DEVICE auto MakeGemmPadViews(const TensorView& views) const
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
     {
         const auto& a_pad_view = [&]() {
             const auto& a_tensor_view = views.at(I0);
@@ -330,8 +377,8 @@ struct GemmKernel
     }
 
     template <typename PadView>
-    CK_TILE_DEVICE auto
-    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) const
+    CK_TILE_DEVICE static auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
     {
         const auto& a_pad_view     = views.at(I0);
         const auto& a_block_window = make_tile_window(
@@ -363,23 +410,27 @@ struct GemmKernel
      * @param kargs GEMM kernel arguments
      * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
      * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
      */
-    CK_TILE_DEVICE void RunGemm(const ADataType* a_ptr,
-                                const BDataType* b_ptr,
-                                CDataType* c_ptr,
-                                const GemmKernelArgs& kargs,
-                                const index_t block_idx_m,
-                                const index_t block_idx_n) const
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
+                                       const BDataType* b_ptr,
+                                       CDataType* c_ptr,
+                                       void* smem_ptr,
+                                       const GemmKernelArgs& kargs,
+                                       const SplitKBatchOffset& splitk_batch_offset,
+                                       const index_t block_idx_m,
+                                       const index_t block_idx_n)
     {
         // Create Gemm tensor views, pad views and tile windows
-        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs);
-        const auto& gemm_pad_views          = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
-
-        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<DstInMemOp>(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);
+        ;
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K);
+        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -389,18 +440,43 @@ struct GemmKernel
 
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(I2);
-        EpiloguePipeline{}(c_block_window, c_block_tile);
+
+        constexpr bool is_output_c_reg_transposed =
+            EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
+        if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) ||
+                     (GemmPipeline::VectorSizeC % 2 == 0 &&
+                      std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
+                      is_output_c_reg_transposed))
+        {
+            EpiloguePipeline{}
+                .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
+                    c_block_window, c_block_tile);
+        }
     }
 
     CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const
     {
         const auto [i_m, i_n] = TilePartitioner{}();
+        const SplitKBatchOffset splitk_batch_offset(kargs);
         // options
-        const ADataType* a_ptr = static_cast<const ADataType*>(kargs.a_ptr);
-        const BDataType* b_ptr = static_cast<const BDataType*>(kargs.b_ptr);
-        CDataType* c_ptr       = static_cast<CDataType*>(kargs.c_ptr);
+        const ADataType* a_ptr =
+            static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;
+        const BDataType* b_ptr =
+            static_cast<const BDataType*>(kargs.b_ptr) + splitk_batch_offset.b_k_split_offset;
+        CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
 
-        RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n);
+        if(kargs.KBatch == 1)
+        {
+            RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+        }
+        else
+        {
+            RunGemm<memory_operation_enum::atomic_add>(
+                a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+        }
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index a72728b4a..40628b186 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -82,6 +82,8 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         return Policy::template GetSmemSize<Problem>();
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+
     template <GemmPipelineScheduler Scheduler>
     struct PipelineImpl : public PipelineImplBase
     {
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index e2e94cf92..c7a74c81e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -132,6 +132,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
         return Policy::template GetSmemSize<Problem>();
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+
     template <GemmPipelineScheduler Scheduler>
     struct PipelineImpl : public PipelineImplBase
     {
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index 822748c69..11a18e52c 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -53,6 +53,8 @@ struct GemmPipelineAGmemBGmemCRegV1
         return Policy::template GetSmemSize<Problem>();
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+
     template <typename ADramBlockWindowTmp,
               typename BDramBlockWindowTmp,
               typename AElementFunction,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index b475ebb7b..d0cc1ed9c 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -13,6 +13,8 @@ namespace ck_tile {
 struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
 {
 
+    static constexpr bool TransposeC = false;
+
 #if 0
     // 2d
     template <typename Problem>
@@ -114,8 +116,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     {
         constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
         constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
-        index_t smem_size             = 0;
-        smem_size += smem_size_a + smem_size_b;
+        constexpr index_t smem_size   = smem_size_a + smem_size_b;
 
         return smem_size;
     }
@@ -485,13 +486,14 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
         }
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
+
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
-        constexpr bool TransposeC = false;
-        constexpr auto I0         = number<0>{};
-        constexpr auto I1         = number<1>{};
-        constexpr auto I2         = number<2>{};
+        constexpr auto I0 = number<0>{};
+        constexpr auto I1 = number<1>{};
+        constexpr auto I2 = number<2>{};
 
         using AccDataType     = float;
         using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index 96a5a61c8..07d4dc441 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -36,6 +36,8 @@ struct GemmPipelineAGmemBGmemCRegV2
                    Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+
     template <typename ADramBlockWindowTmp,
               typename BDramBlockWindowTmp,
               typename AElementFunction,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index 94b0faf03..6c317916b 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -444,6 +444,8 @@ struct UniversalGemmPipelineAgBgCrPolicy
         }
     }
 
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
+
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
index d3f307787..e7e9b3d67 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -93,7 +93,7 @@ class TestCkTileBatchedGemm : public ::testing::Test
 
         auto kargs = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.batch_count);
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
         constexpr dim3 blocks = Kernel::BlockSize();
 
         if(s.log_level_ > 0)
@@ -186,6 +186,7 @@ class TestCkTileBatchedGemm : public ::testing::Test
         args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
         args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
         args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
+        args.k_batch        = 1;
         args.M              = M;
         args.N              = N;
         args.K              = K;
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index 53ead4d8d..4b0e40060 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -74,7 +74,9 @@ class TestCkTileGemmPipeline : public ::testing::Test
                 ck_tile::
                     GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>>;
 
-        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
+        const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
         const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
         const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
 
-- 
GitLab


From 4e076909b6c1e1404d9ff5dc0e71e3be1c06569e Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Sun, 29 Dec 2024 14:29:56 +0800
Subject: [PATCH 143/153] Remove using partitioner for all fmha kernels (#1778)

* Remove using tile partitioner for fmha_fwd_kernel

* Remove using tile partitioner for fmha_fwd_splitkv and splitkv-combine kernels

* Remove using tile partitioner for fmha_fwd_appendkv kernel

* Unify the format of GetTileIndex
---
 example/ck_tile/01_fmha/README.md             |   3 +-
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |  20 +---
 .../01_fmha/codegen/ops/fmha_fwd_appendkv.py  |   6 +-
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |  10 +-
 example/ck_tile/01_fmha/fmha_fwd.hpp          |  14 ++-
 include/ck_tile/ops/fmha.hpp                  |   3 -
 .../fmha/kernel/fmha_fwd_appendkv_kernel.hpp  |  28 +++--
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |  78 +++++++++++--
 .../fmha_fwd_splitkv_combine_kernel.hpp       |  39 +++++--
 ...a_fwd_splitkv_combine_tile_partitioner.hpp |  48 --------
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |  40 +++++--
 .../fmha_fwd_splitkv_tile_partitioner.hpp     |  54 ---------
 .../fmha/kernel/fmha_fwd_tile_partitioner.hpp | 105 ------------------
 13 files changed, 171 insertions(+), 277 deletions(-)
 delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp
 delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
 delete mode 100644 include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp

diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md
index c7ab296c3..e9806e7a6 100644
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -15,8 +15,7 @@ This will result in an executable `build/bin/tile_example_fmha_fwd`
 ## kernel
 The kernel template is `fmha_fwd_kernel.hpp`, this is the grid-wise op in old ck_tile's terminology. We put it here purposely, to demonstrate one can construct a kernel by using various internal component from ck_tile. We may still have an implementation under ck_tile's include path (in the future) for the kernel template.
 
-There are 3 template parameters for this kernel template.
-* `TilePartitioner` is used to map the workgroup to corresponding tile, `fmha_fwd_tile_partitioner.hpp` in this folder served as this purpose.
+There are 2 template parameters for this kernel template.
 * `FmhaPipeline` is one of the block_tile_pipeline(under `include/ck_tile/tile_program/block_tile_pipeline`) which is a performance critical component. Indeed, we did a lot of optimization and trials to optimize the pipeline and may still workout more performance pipeline and update into that folder. People only need to replace this pipeline type and would be able to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)).
 * `EpiloguePipeline` will modify and store out the result in the last phase. People usually will do lot of post-fusion at this stage, so we also abstract this concept. Currently we didn't do much thing at the epilogue stage but leave the room for future possible support.
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 66814f5a1..1c9d743f3 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -29,11 +29,6 @@ K0_MAX_SUBMAX_MAP = {
     256: 256
 }
 
-TILE_PARTITIONER_MAP = {
-    "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
-    "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
-}
-
 FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -90,9 +85,7 @@ using fmha_epilogue_{F_idx} =
                                            {F_spad}, {F_dvpad}>>;
 
 using fmha_kernel_{F_idx} =
-    ck_tile::FmhaFwdKernel<{F_tile_partitioner}<fmha_shape_{F_idx}>,
-                  fmha_pipeline_{F_idx},
-                  fmha_epilogue_{F_idx}>;
+    ck_tile::FmhaFwdKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
 
 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                         {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
@@ -329,12 +322,6 @@ class FmhaFwdKernel:
     F_pipeline      : FmhaFwdPipeline
     mask_impl       : str
 
-    def get_tp(self) -> str:
-        if self.F_mode == 'group':
-            return 'hbs'
-        else:
-            return 'shb'
-
     @property
     def template(self) -> str:
         kernel_body = str()
@@ -374,13 +361,12 @@ class FmhaFwdKernel:
                 F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                 F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
                 F_mode          = MODE_MAP[self.F_mode],
-                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag],
-                F_tile_partitioner = TILE_PARTITIONER_MAP[self.get_tp()])
+                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag])
 
     @property
     def name(self) -> str:
         # TODO: we don't encode idx here
-        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_{self.get_tp()}_" + \
+        return f"fmha_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \
                 self.F_tile.name + '_' + self.F_pipeline.name
 
     @property
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index fb998a33d..2f2081930 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -46,9 +46,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProbl
 using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline<
     fmha_pipeline_problem_{F_idx}>;
 
-using fmha_kernel_{F_idx} =
-    ck_tile::FmhaFwdAppendKVKernel<ck_tile::FmhaFwdAppendKVTilePartitioner<{F_bs}, {F_bsk}, {F_bd}, {F_bdv}>,
-                  fmha_pipeline_{F_idx}>;
+using fmha_kernel_{F_idx} = ck_tile::FmhaFwdAppendKVKernel<fmha_pipeline_{F_idx}>;
 
 using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout},
                         {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;
@@ -355,4 +353,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
         _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
\ No newline at end of file
+        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 2f7edd547..fb8a4389f 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -96,9 +96,7 @@ using fmha_epilogue =
                                            {F_spad}, {F_dvpad}>>;
 
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVKernel<ck_tile::FmhaFwdSplitKVTilePartitioner<fmha_shape>,
-                  fmha_pipeline,
-                  fmha_epilogue>;
+    ck_tile::FmhaFwdSplitKVKernel<fmha_pipeline, fmha_epilogue>;
 
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
@@ -176,11 +174,7 @@ using fmha_epilogue =
                                            false, false>>;
 
 using fmha_kernel =
-    ck_tile::FmhaFwdSplitKVCombineKernel<
-        ck_tile::FmhaFwdSplitKVCombineTilePartitioner<
-            fmha_pipeline_problem::kM0, fmha_pipeline_problem::kN1>,
-        fmha_pipeline,
-        fmha_epilogue>;
+    ck_tile::FmhaFwdSplitKVCombineKernel<fmha_pipeline, fmha_epilogue>;
 
 static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 0e821ed5d..0368de352 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -400,8 +400,18 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
         }
     }();
 
-    dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);
-    return ck_tile::make_tuple(kargs, grids);
+    if constexpr(FmhaKernel::kIsGroupMode)
+    {
+        dim3 grids = FmhaKernel::GridSize(
+            args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr);
+        return ck_tile::make_tuple(kargs, grids);
+    }
+    else
+    {
+        dim3 grids =
+            FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false);
+        return ck_tile::make_tuple(kargs, grids);
+    }
 }
 
 template <typename Kernel>
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index 7a09e4622..d5920f483 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -14,10 +14,7 @@
 #include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_tile_partitioner.hpp"
 #include "ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp"
 #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp"
-#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp"
 #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp"
-#include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp"
-#include "ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_convert_dq.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp"
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
index d598f9743..9fec9a320 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
@@ -10,10 +10,9 @@
 
 namespace ck_tile {
 
-template <typename TilePartitioner_, typename FmhaPipeline_>
+template <typename FmhaPipeline_>
 struct FmhaFwdAppendKVKernel
 {
-    using TilePartitioner                         = ck_tile::remove_cvref_t<TilePartitioner_>;
     using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
@@ -234,12 +233,25 @@ struct FmhaFwdAppendKVKernel
         return kargs;
     }
 
-    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                            ck_tile::index_t nhead,
-                                            ck_tile::index_t seqlen_q,
-                                            ck_tile::index_t seqlen_knew)
+    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
+                                                ck_tile::index_t nhead,
+                                                ck_tile::index_t seqlen_q,
+                                                ck_tile::index_t seqlen_knew)
     {
-        return TilePartitioner::GridSize(batch_size, nhead, seqlen_q, seqlen_knew);
+        // TODO: this may need tuning
+        return dim3(std::max(ck_tile::integer_divide_ceil(seqlen_q, FmhaPipeline::kM0),
+                             ck_tile::integer_divide_ceil(seqlen_knew, FmhaPipeline::kN0)),
+                    nhead,
+                    batch_size);
+    }
+
+    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& /* kargs */)
+    {
+        const index_t i_tile  = blockIdx.x;
+        const index_t i_nhead = blockIdx.y;
+        const index_t i_batch = blockIdx.z;
+
+        return ck_tile::make_tuple(i_tile, i_nhead, i_batch);
     }
 
     __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
@@ -247,7 +259,7 @@ struct FmhaFwdAppendKVKernel
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
         // divide problem
-        const auto [i_tile, i_nhead, i_batch] = TilePartitioner{}();
+        const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);
 
         const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
         const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 90102a6c6..f107b10df 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -20,10 +20,9 @@
 
 namespace ck_tile {
 
-template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
+template <typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
 {
-    using TilePartitioner                         = ck_tile::remove_cvref_t<TilePartitioner_>;
     using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
     using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
@@ -84,7 +83,7 @@ struct FmhaFwdKernel
             return n.empty() ? n : std::string("p") + n; }();
         return
             _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
-            "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_"
+            "_" + (kIsGroupMode ? "group" : "batch") + "_"
             "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
                     _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
             "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
@@ -867,9 +866,75 @@ struct FmhaFwdKernel
     CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_,
                                                 ck_tile::index_t nhead_,
                                                 ck_tile::index_t seqlen_q_,
-                                                ck_tile::index_t hdim_v_)
+                                                ck_tile::index_t hdim_v_,
+                                                bool has_padded_seqlen_k = false)
     {
-        return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
+        // has_padded_seqlen_k is determined by checking (seqlen_k_ptr != nullptr)
+        if(has_padded_seqlen_k)
+        {
+            // TODO: this may need tuning
+            return dim3(nhead_,
+                        batch_size_,
+                        ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) *
+                            ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1));
+        }
+        else
+        {
+            // TODO: this may need tuning
+            return dim3(ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) *
+                            ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1),
+                        nhead_,
+                        batch_size_);
+        }
+    }
+
+    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs)
+    {
+        bool has_padded_seqlen_k = false;
+
+        if constexpr(kIsGroupMode)
+            has_padded_seqlen_k = (kargs.seqlen_k_ptr != nullptr);
+
+        if(has_padded_seqlen_k)
+        {
+            // const index_t num_tile_m0 = seqlen_q / kM0;
+            const index_t num_tile_n1 =
+                ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1);
+
+            const index_t i_block = blockIdx.z;
+            const index_t i_nhead = blockIdx.x;
+            const index_t i_batch = blockIdx.y;
+
+            const auto f = [](index_t dividend, index_t divisor) {
+                index_t quotient = dividend / divisor;
+                index_t modulus  = dividend - quotient * divisor;
+                return ck_tile::make_tuple(quotient, modulus);
+            };
+
+            const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+            return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+        }
+        else
+        {
+            // const index_t num_tile_m0 = seqlen_q / kM0;
+            const index_t num_tile_n1 =
+                ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1);
+
+            const index_t i_block = blockIdx.x;
+            const index_t i_nhead = blockIdx.y;
+            const index_t i_batch = blockIdx.z;
+
+            const auto f = [](index_t dividend, index_t divisor) {
+                index_t quotient = dividend / divisor;
+                index_t modulus  = dividend - quotient * divisor;
+                return ck_tile::make_tuple(quotient, modulus);
+            };
+
+            const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+            return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+        }
     }
 
     CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
@@ -885,8 +950,7 @@ struct FmhaFwdKernel
         __shared__ char smem_ptr[GetSmemSize()];
 
         // divide problem
-        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-            TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
         const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
         const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index a0adfdc12..a342a91f1 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -5,10 +5,9 @@
 
 namespace ck_tile {
 
-template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
+template <typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdSplitKVCombineKernel
 {
-    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using FmhaPipeline     = remove_cvref_t<FmhaPipeline_>;
     using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
 
@@ -235,12 +234,35 @@ struct FmhaFwdSplitKVCombineKernel
         return kargs;
     }
 
-    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                            ck_tile::index_t nhead,
-                                            ck_tile::index_t max_seqlen_q,
-                                            ck_tile::index_t hdim_v)
+    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
+                                                ck_tile::index_t nhead,
+                                                ck_tile::index_t max_seqlen_q,
+                                                ck_tile::index_t hdim_v)
     {
-        return TilePartitioner::GridSize(batch_size, nhead, max_seqlen_q, hdim_v);
+        // TODO: this may need tuning
+        return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, FmhaPipeline::kM0) *
+                        ck_tile::integer_divide_ceil(hdim_v, FmhaPipeline::kN1),
+                    nhead,
+                    batch_size);
+    }
+
+    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs)
+    {
+        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1);
+
+        const index_t i_block = blockIdx.x;
+        const index_t i_nhead = blockIdx.y;
+        const index_t i_batch = blockIdx.z;
+
+        const auto f = [](index_t dividend, index_t divisor) {
+            index_t quotient = dividend / divisor;
+            index_t modulus  = dividend - quotient * divisor;
+            return ck_tile::make_tuple(quotient, modulus);
+        };
+
+        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
     }
 
     __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
@@ -256,8 +278,7 @@ struct FmhaFwdSplitKVCombineKernel
         __shared__ char smem_ptr[GetSmemSize()];
 
         // divide problem
-        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-            TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
         const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
         const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp
deleted file mode 100644
index 3b7390971..000000000
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_tile_partitioner.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-
-template <index_t kM0_, index_t kN1_>
-struct FmhaFwdSplitKVCombineTilePartitioner
-{
-    static constexpr ck_tile::index_t kM0 = kM0_;
-    static constexpr ck_tile::index_t kN1 = kN1_;
-
-    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                                ck_tile::index_t nhead,
-                                                ck_tile::index_t max_seqlen_q,
-                                                ck_tile::index_t hdim_v)
-    {
-        // TODO: this may need tuning
-        return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v, kN1),
-                    nhead,
-                    batch_size);
-    }
-
-    CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v)
-    {
-        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1);
-
-        const index_t i_block = blockIdx.x;
-        const index_t i_nhead = blockIdx.y;
-        const index_t i_batch = blockIdx.z;
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck_tile::make_tuple(quotient, modulus);
-        };
-
-        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-    }
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index dc1748726..10ab25119 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -17,10 +17,9 @@
 
 namespace ck_tile {
 
-template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
+template <typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdSplitKVKernel
 {
-    using TilePartitioner                         = ck_tile::remove_cvref_t<TilePartitioner_>;
     using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
     using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
@@ -476,13 +475,35 @@ struct FmhaFwdSplitKVKernel
         return kargs;
     }
 
-    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                            ck_tile::index_t nhead,
-                                            ck_tile::index_t max_seqlen_q,
-                                            ck_tile::index_t hdim_v,
-                                            ck_tile::index_t num_splits)
+    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
+                                                ck_tile::index_t nhead,
+                                                ck_tile::index_t max_seqlen_q,
+                                                ck_tile::index_t hdim_v,
+                                                ck_tile::index_t num_splits)
     {
-        return TilePartitioner::GridSize(batch_size, nhead, max_seqlen_q, hdim_v, num_splits);
+        // TODO: this may need tuning
+        return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, FmhaPipeline::kM0) *
+                        ck_tile::integer_divide_ceil(hdim_v, FmhaPipeline::kN1) * num_splits,
+                    nhead,
+                    batch_size);
+    }
+
+    CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs)
+    {
+        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1);
+
+        const auto f = [](index_t dividend, index_t divisor) {
+            index_t quotient = dividend / divisor;
+            index_t modulus  = dividend - quotient * divisor;
+            return ck_tile::make_tuple(quotient, modulus);
+        };
+
+        const auto [mn, i_split]        = f(blockIdx.x, kargs.num_splits);
+        const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1);
+        const index_t i_nhead           = blockIdx.y;
+        const index_t i_batch           = blockIdx.z;
+
+        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch);
     }
 
     __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
@@ -498,8 +519,7 @@ struct FmhaFwdSplitKVKernel
         __shared__ char smem_ptr[GetSmemSize()];
 
         // divide problem
-        const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] =
-            TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v, kargs.num_splits);
+        const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);
 
         const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
         const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
deleted file mode 100644
index 5a52fa0f6..000000000
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_tile_partitioner.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-
-template <typename BlockFmhaShape_>
-struct FmhaFwdSplitKVTilePartitioner
-{
-    using BlockFmhaShape = ck_tile::remove_cvref_t<BlockFmhaShape_>;
-
-    static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0;
-    static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0;
-    static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0;
-    static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1;
-    static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1;
-
-    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
-                                                ck_tile::index_t nhead,
-                                                ck_tile::index_t max_seqlen_q,
-                                                ck_tile::index_t hdim_v,
-                                                ck_tile::index_t num_splits)
-    {
-        // TODO: this may need tuning
-        return dim3(ck_tile::integer_divide_ceil(max_seqlen_q, kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v, kN1) * num_splits,
-                    nhead,
-                    batch_size);
-    }
-
-    CK_TILE_DEVICE auto
-    operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v, ck_tile::index_t num_splits)
-    {
-        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1);
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck_tile::make_tuple(quotient, modulus);
-        };
-
-        const auto [mn, i_split]        = f(blockIdx.x, num_splits);
-        const auto [i_tile_m, i_tile_n] = f(mn, num_tile_n1);
-        const index_t i_nhead           = blockIdx.y;
-        const index_t i_batch           = blockIdx.z;
-
-        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_split, i_nhead, i_batch);
-    }
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp
deleted file mode 100644
index 2dca84b78..000000000
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_tile_partitioner.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-
-namespace ck_tile {
-
-template <typename BlockFmhaShape_>
-struct FmhaFwdTilePartitioner
-{
-    using BlockFmhaShape = ck_tile::remove_cvref_t<BlockFmhaShape_>;
-
-    static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0;
-    static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0;
-    static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0;
-    static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1;
-    static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1;
-
-    static constexpr const char* name = "shb";
-
-    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_,
-                                                ck_tile::index_t nhead_,
-                                                ck_tile::index_t seqlen_q_,
-                                                ck_tile::index_t hdim_v_)
-    {
-        // TODO: this may need tuning
-        return dim3(ck_tile::integer_divide_ceil(seqlen_q_, kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v_, kN1),
-                    nhead_,
-                    batch_size_);
-    }
-
-    CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v)
-    {
-        // const index_t num_tile_m0 = seqlen_q / kM0;
-        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1);
-
-        const index_t i_block = blockIdx.x;
-        const index_t i_nhead = blockIdx.y;
-        const index_t i_batch = blockIdx.z;
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck_tile::make_tuple(quotient, modulus);
-        };
-
-        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-    }
-};
-
-template <typename BlockFmhaShape_>
-using FmhaFwdTilePartitioner_SHB = FmhaFwdTilePartitioner<BlockFmhaShape_>;
-
-template <typename BlockFmhaShape_>
-struct FmhaFwdTilePartitioner_HBS
-{
-    using BlockFmhaShape = ck_tile::remove_cvref_t<BlockFmhaShape_>;
-
-    static constexpr ck_tile::index_t kM0 = BlockFmhaShape::kM0;
-    static constexpr ck_tile::index_t kN0 = BlockFmhaShape::kN0;
-    static constexpr ck_tile::index_t kK0 = BlockFmhaShape::kK0;
-    static constexpr ck_tile::index_t kN1 = BlockFmhaShape::kN1;
-    static constexpr ck_tile::index_t kK1 = BlockFmhaShape::kK1;
-
-    static constexpr const char* name = "hbs";
-
-    CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_,
-                                                ck_tile::index_t nhead_,
-                                                ck_tile::index_t seqlen_q_,
-                                                ck_tile::index_t hdim_v_)
-    {
-        // TODO: this may need tuning
-        return dim3(nhead_,
-                    batch_size_,
-                    ck_tile::integer_divide_ceil(seqlen_q_, kM0) *
-                        ck_tile::integer_divide_ceil(hdim_v_, kN1));
-    }
-
-    CK_TILE_DEVICE auto operator()(ck_tile::index_t /*seqlen_q*/, ck_tile::index_t hdim_v)
-    {
-        // const index_t num_tile_m0 = seqlen_q / kM0;
-        const index_t num_tile_n1 = ck_tile::integer_divide_ceil(hdim_v, kN1);
-
-        const index_t i_block = blockIdx.z;
-        const index_t i_nhead = blockIdx.x;
-        const index_t i_batch = blockIdx.y;
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck_tile::make_tuple(quotient, modulus);
-        };
-
-        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-        return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-    }
-};
-
-} // namespace ck_tile
-- 
GitLab


From 159fa31946191747eed397abfa23a1910a85de67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 1 Jan 2025 18:00:06 +0100
Subject: [PATCH 144/153] Add NGCHW bf16 grouped conv fwd instances (#1783)

* Add NGCHW bf16 grouped conv fwd instances

* add missed cmake
---
 .../gpu/grouped_convolution_forward.hpp       | 18 ++++++-
 .../grouped_convolution_forward_comp_xdl.inc  | 16 +++++++
 ...uped_convolution_forward_mem_inter_xdl.inc | 16 +++++++
 ...uped_convolution_forward_mem_intra_xdl.inc | 16 +++++++
 .../gpu/grouped_convolution_forward_xdl.inc   | 16 +++++++
 ..._convolution_forward_xdl_merged_groups.inc | 14 ++++++
 .../gpu/grouped_conv2d_fwd/CMakeLists.txt     |  5 ++
 ...l_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp | 39 +++++++++++++++
 ...wd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 38 +++++++++++++++
 ...hw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp | 39 +++++++++++++++
 ...hw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp | 39 +++++++++++++++
 ...groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 48 +++++++++++++++++++
 .../test_grouped_convnd_fwd.cpp               |  1 +
 13 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index 8090b2449..01415c2dd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -304,7 +304,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     op_ptrs);
             }
 #endif
-
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
 #ifdef CK_ENABLE_INT8
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t> && is_same_v<AComputeType, int8_t> &&
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
index e47a876e1..9a83e36b9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
index f74622ad4..662fadadc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
index 81737b614..f283fe855 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
index 4cb2aae09..c977c89c9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -204,6 +204,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
index 1bd2697b9..a81e1e07b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
@@ -23,6 +23,20 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_inst
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP16
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 98bee66a9..146916cfd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -11,6 +11,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -27,6 +28,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -42,10 +44,12 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
@@ -56,6 +60,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
new file mode 100644
index 000000000..65e233ce0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                        NGCHW,
+                                                        GKYXC,
+                                                        Empty_Tuple,
+                                                        NGKHW,
+                                                        ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
new file mode 100644
index 000000000..6ee6aa1e4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                                                              NGCHW,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
new file mode 100644
index 000000000..88b5f30da
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
new file mode 100644
index 000000000..48cca9c3f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NGCHW,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NGKHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
new file mode 100644
index 000000000..14f00d8e8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwdDefault>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                 NGCHW,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index 1abd4fd9f..25481e0d7 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -64,6 +64,7 @@ using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
                                        std::tuple<int8_t, NHWGC, GKYXC, NHWGK>,
                                        std::tuple<float, NGCHW, GKYXC, NGKHW>,
                                        std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>,
+                                       std::tuple<ck::bhalf_t, NGCHW, GKYXC, NGKHW>,
                                        std::tuple<int8_t, NGCHW, GKYXC, NGKHW>>;
 
 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
-- 
GitLab


From 1d8e4ec2ced2da813947e89654f69f7bf6b5079e Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Thu, 2 Jan 2025 04:48:06 +0100
Subject: [PATCH 145/153] Jing's contribution: prototype of mixed precision
 gemm FP16/BF16xint4 GEMM (#1762)

* add a prototype of int4

* clean

* debug

* clean

* clean

* move packed into dynamic_buffer

* fixed coord reset

* add fast pki4 to half conversion

* fix

* fixed reference and host_tensor

* fixed tensor init

* format

* debug i4_to_f16_convert

* format

* fixed splitk

* weight permute

* add b tile permute

* clean

* weight permute with splitki

* format

* improve weight layout

* add and_or_b32

* fixed splitk crush

* add permute switch as a template

* recover v3r1

* clean

* failure with intrawave v2

* fixed

* fixed

* add ckProfiler

* add bfp16 support

* add bf16 example

* fixed int4 to bhalf_t conversion

* format

* fixed int4 to bf16 conversion

* clean

* add instances for mem

* clean

* fixed host tensor size

* fixed

* debug

* fixed

* add pk_i4_t as a struct

* fix

* Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* revert

* Update example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* fixed comments

* revert

* clean

* revert

* revert

* fixed

* Update CMakeLists.txt

* Update script/cmake-ck-dev.sh

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* Update CMakeLists.txt

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>

* fixed

* fixed

* fixed

* revert

* revert

* add comments

* format

* fixed assert

* fixed

* Fix I4 define in ckProfiler

* Fixed example_gemm_xdl_bf16_pk_i4_v3 test failed issue

---------

Co-authored-by: Jing Zhang <jizhan@fb.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: mtgu0705 <mtgu@amd.com>
---
 CMakeLists.txt                                |   2 +-
 cmake/EnableCompilerWarnings.cmake            |   2 +-
 example/01_gemm/CMakeLists.txt                |   2 +
 example/01_gemm/common.hpp                    |  82 +++++
 example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp    | 253 +++++++++++++++
 example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp      |  16 +-
 example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp    | 303 ++++++++++++++++++
 example/01_gemm/gemm_xdl_fp16_v3.cpp          |  20 +-
 example/01_gemm/run_gemm_example.inc          |  82 -----
 .../01_gemm/run_gemm_example_streamk_v2.inc   |  82 -----
 example/01_gemm/run_gemm_example_v2.inc       |  82 -----
 include/ck/library/utility/host_tensor.hpp    |  65 +++-
 .../library/utility/host_tensor_generator.hpp |  30 ++
 include/ck/tensor/static_tensor.hpp           |   4 +-
 .../gpu/device/device_gemm_v2.hpp             |   4 +
 .../impl/device_gemm_xdl_cshuffle_v3.hpp      |  13 +-
 .../element/unary_element_wise_operation.hpp  | 189 +++++++++++
 .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp    | 104 ++++--
 .../threadwise_tensor_slice_transfer.hpp      |  50 ++-
 .../threadwise_tensor_slice_transfer_v3r1.hpp |  74 +++--
 include/ck/utility/amd_buffer_addressing.hpp  |   3 +-
 include/ck/utility/amd_inline_asm.hpp         |  21 ++
 include/ck/utility/data_type.hpp              |  35 ++
 include/ck/utility/dynamic_buffer.hpp         |   6 +-
 include/ck/utility/static_buffer.hpp          |   6 +-
 .../cpu/reference_gemm.hpp                    |  22 ++
 .../device_operation_instance_factory.hpp     |   1 +
 .../gpu/gemm_universal.hpp                    |  33 ++
 .../gpu/gemm_universal/CMakeLists.txt         |   3 +
 ...mm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp |  87 +++++
 ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp |  24 ++
 ...gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp |  86 +++++
 ...4_f16_mk_nk_mn_mem_v2_default_instance.cpp |  24 ++
 .../profiler/profile_gemm_universal_impl.hpp  | 103 +++++-
 profiler/src/CMakeLists.txt                   |   1 -
 profiler/src/profile_gemm_universal.cpp       |  17 +-
 script/cmake-ck-dev.sh                        |   2 +-
 37 files changed, 1583 insertions(+), 350 deletions(-)
 create mode 100644 example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
 create mode 100644 example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index be4efd3df..6d4176735 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -585,7 +585,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
    )
    add_subdirectory(example)
    if(BUILD_TESTING)
-	   add_subdirectory(test)
+       add_subdirectory(test)
    endif()
 endif()
 
diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake
index 93fd306e9..fb2b38d68 100644
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -66,7 +66,7 @@ else()
             -Wunreachable-code
             -Wunused
             -Wno-reserved-identifier
-	    -Werror
+            -Werror
             -Wno-option-ignored
             -Wsign-compare
             -Wno-extra-semi-stmt
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 957acce16..df7be0466 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -29,6 +29,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3)
 add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3)
 add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp)
+add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp)
+add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
 add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index a3a62d4cf..9664c50b6 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -287,3 +287,85 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
 
     return true;
 }
+
+template <typename DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 1e-1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 1.5e-1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
+
+template <typename DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 16.1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 8192.1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
new file mode 100644
index 000000000..7b491173a
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::pk_i4_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::bhalf_t;
+using CDataType        = ck::bhalf_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault      = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr bool PermuteA         = false;
+static constexpr bool PermuteB         = true;
+static constexpr ck::index_t KPerBlock = 128;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        128,
+        16, 64,
+        KPerBlock, 8, 32,
+        16,   16,
+        1,    2,
+        S<16, 8, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<4, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 16, 1, 8>, 4,
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;
+
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    if constexpr(PermuteB)
+    {
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int j = 0; j < K0; j++)
+        {
+            for(int i = 0; i < N; i++)
+            {
+                for(int jj = 0; jj < K1; jj++)
+                {
+                    b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                }
+            }
+        }
+    }
+    else
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                b_k_n_permute(i * K + j) = b_k_n(i * K + j);
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      KBatch,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
index 2e27fc66f..b0e36b394 100644
--- a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
 
-using ADataType        = ck::f8_t;
-using BDataType        = ck::half_t;
+using ADataType        = ck::half_t;
+using BDataType        = ck::f8_t;
 using AccDataType      = float;
 using CShuffleDataType = ck::half_t;
 using CDataType        = ck::half_t;
@@ -29,15 +29,15 @@ using DeviceGemmV2Instance =
         AElementOp, BElementOp, CElementOp, GemmDefault, 
         64,
         16, 16, 
-        64, 16, 8,
+        256, 8, 16,
         16,   16,
         1,    1, 
-        S<4, 16, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
-        2, 16, 16, 0,
-        S<8, 8, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        S<32, 2, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
         2, 8, 8, 0,
+        S<16, 4, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 16, 16, 0,
         1, 1, S<1, 16, 1, 4>, 4,
-        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v1>;
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
new file mode 100644
index 000000000..e8a3064de
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+using ADataType        = ck::half_t;
+using BDataType        = ck::pk_i4_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr bool PermuteA         = false;
+static constexpr bool PermuteB         = true;
+static constexpr ck::index_t KPerBlock = 128;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        128,
+        16, 128,
+        KPerBlock, 8, 32,
+        16,   16,
+        1,    4,
+        S<16, 8, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<4, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 16, 1, 8>, 4,
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;
+
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    if constexpr(PermuteB)
+    {
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int j = 0; j < K0; j++)
+        {
+            for(int i = 0; i < N; i++)
+            {
+                for(int jj = 0; jj < K1; jj++)
+                {
+                    b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                }
+            }
+        }
+    }
+    else
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                b_k_n_permute(i * K + j) = b_k_n(i * K + j);
+            }
+        }
+    }
+
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int hi   = input[2];
+                int lo   = input[0];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 0, i) = i4x2;
+            }
+
+            {
+                int hi   = input[6];
+                int lo   = input[4];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 2, i) = i4x2;
+            }
+
+            {
+                int hi   = input[3];
+                int lo   = input[1];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 4, i) = i4x2;
+            }
+
+            {
+                int hi   = input[7];
+                int lo   = input[5];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 6, i) = i4x2;
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      KBatch,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_fp16_v3.cpp b/example/01_gemm/gemm_xdl_fp16_v3.cpp
index ad370f570..4a969246c 100644
--- a/example/01_gemm/gemm_xdl_fp16_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_v3.cpp
@@ -12,7 +12,7 @@ using CShuffleDataType = ck::half_t;
 using CDataType        = ck::half_t;
 
 using ALayout = Row;
-using BLayout = Row;
+using BLayout = Col;
 using CLayout = Row;
 
 using AElementOp = PassThrough;
@@ -27,17 +27,17 @@ using DeviceGemmV2Instance =
         ALayout,   BLayout,  CLayout,   
         ADataType,   BDataType,  CDataType,  AccDataType,  CShuffleDataType, 
         PassThrough, PassThrough, PassThrough, GemmDefault, 
-        256,
-        224, 256, 
-        64, 8, 2,
+        64,
+        16, 16, 
+        256, 8, 8,
         16,   16,
-        7,    8,
-        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        1,    1,
+        S<32, 2, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
         2, 8, 8, 0,
-        S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>, 
-        1, 8, 2, 0,
-        1, 2, S<1, 32, 1, 8>, 8,
-        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
+        S<32, 2, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 8, 8, 0,
+        1, 1, S<1, 16, 1, 4>, 4,
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 3ee6e2685..4371af624 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -5,88 +5,6 @@
 
 #include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
 
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc
index 04243b829..9ee380d24 100755
--- a/example/01_gemm/run_gemm_example_streamk_v2.inc
+++ b/example/01_gemm/run_gemm_example_streamk_v2.inc
@@ -3,88 +3,6 @@
 
 #pragma once
 
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 1e-1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 16.1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 8192.1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index 5b6969f1d..2b60fa5d2 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -3,88 +3,6 @@
 
 #pragma once
 
-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 1e-1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 16.1; // 240 and 224 are acceptable
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 8192.1; // 57344 and 49152 are acceptable
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
index 18e1db462..ef5738be0 100644
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -266,18 +266,18 @@ struct Tensor
     using Data       = std::vector<T>;
 
     template <typename X>
-    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
+    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(GetElementSpaceSize())
     {
     }
 
     template <typename X, typename Y>
     Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
-        : mDesc(lens, strides), mData(mDesc.GetElementSpaceSize())
+        : mDesc(lens, strides), mData(GetElementSpaceSize())
     {
     }
 
     template <typename Lengths>
-    Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
+    Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize())
     {
     }
 
@@ -287,7 +287,7 @@ struct Tensor
     {
     }
 
-    Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
+    Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}
 
     template <typename OutT>
     Tensor<OutT> CopyAsType() const
@@ -322,7 +322,17 @@ struct Tensor
 
     std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
 
-    std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
+    std::size_t GetElementSpaceSize() const
+    {
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return (mDesc.GetElementSpaceSize() + 1) / 2;
+        }
+        else
+        {
+            return mDesc.GetElementSpaceSize();
+        }
+    }
 
     std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
 
@@ -469,29 +479,64 @@ struct Tensor
     template <typename... Is>
     std::size_t GetOffsetFromMultiIndex(Is... is) const
     {
-        return mDesc.GetOffsetFromMultiIndex(is...);
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return mDesc.GetOffsetFromMultiIndex(is...) / 2;
+        }
+        else
+        {
+            return mDesc.GetOffsetFromMultiIndex(is...);
+        }
     }
 
     template <typename... Is>
     T& operator()(Is... is)
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2];
+        }
+        else
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        }
     }
 
     template <typename... Is>
     const T& operator()(Is... is) const
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2];
+        }
+        else
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        }
     }
 
     T& operator()(std::vector<std::size_t> idx)
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2];
+        }
+        else
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        }
     }
 
     const T& operator()(std::vector<std::size_t> idx) const
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<T>, ck::pk_i4_t>)
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2];
+        }
+        else
+        {
+            return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        }
     }
 
     typename Data::iterator begin() { return mData.begin(); }
diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp
index ab9f01b53..6a90523c3 100644
--- a/include/ck/library/utility/host_tensor_generator.hpp
+++ b/include/ck/library/utility/host_tensor_generator.hpp
@@ -81,6 +81,20 @@ struct GeneratorTensor_1<int8_t>
     }
 };
 
+template <>
+struct GeneratorTensor_1<ck::pk_i4_t>
+{
+    int8_t value = 1;
+
+    template <typename... Is>
+    ck::pk_i4_t operator()(Is...)
+    {
+        int t         = value + 8;
+        ck::pk_i4_t r = ((t << 4) + t) & 0xff;
+        return r;
+    }
+};
+
 template <typename T>
 struct GeneratorTensor_2
 {
@@ -121,6 +135,22 @@ struct GeneratorTensor_2<int8_t>
     }
 };
 
+template <>
+struct GeneratorTensor_2<ck::pk_i4_t>
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    ck::pk_i4_t operator()(Is...)
+    {
+        int hi        = std::rand() % (max_value - min_value) + min_value + 8;
+        int lo        = std::rand() % (max_value - min_value) + min_value + 8;
+        ck::pk_i4_t r = ((hi << 4) + lo) & 0xff;
+        return r;
+    }
+};
+
 #if defined CK_ENABLE_FP8
 template <>
 struct GeneratorTensor_2<ck::f8_t>
diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp
index d719ef976..ef2bedd65 100644
--- a/include/ck/tensor/static_tensor.hpp
+++ b/include/ck/tensor/static_tensor.hpp
@@ -167,7 +167,7 @@ struct StaticTensorTupleOfVectorBuffer
     // Idx is for S, not X. Idx should be aligned with X
     template <typename X,
               typename Idx,
-              typename enable_if<has_same_scalar_type<S, X>::value &&
+              typename enable_if<(has_same_scalar_type<S, X>::value || !is_native_type<S>()) &&
                                      is_known_at_compile_time<Idx>::value && Idx::Size() == ndim_,
                                  bool>::type = false>
     __host__ __device__ constexpr X GetAsType(Idx) const
@@ -201,7 +201,7 @@ struct StaticTensorTupleOfVectorBuffer
     // Idx is for S, not X. Idx should be aligned with X
     template <typename X,
               typename Idx,
-              typename enable_if<has_same_scalar_type<S, X>::value &&
+              typename enable_if<(has_same_scalar_type<S, X>::value || !is_native_type<S>()) &&
                                      is_known_at_compile_time<Idx>::value && Idx::Size() == ndim_,
                                  bool>::type = false>
     __host__ __device__ constexpr void SetAsType(Idx, X x)
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
index b2db35b15..43909f77d 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
@@ -36,6 +36,10 @@ struct DeviceGemmV2 : public BaseOperator
                         CElementwiseOperation c_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    virtual bool GetPermuteA()         = 0;
+    virtual bool GetPermuteB()         = 0;
+    virtual ck::index_t GetKPerBlock() = 0;
 };
 
 template <typename ALayout,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
index ad6aa1e7c..600f12139 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -64,7 +64,9 @@ template <typename ALayout,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
           BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
           typename ComputeTypeA                       = CDataType,
-          typename ComputeTypeB                       = ComputeTypeA>
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
 struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                        BLayout,
                                                        CLayout,
@@ -122,7 +124,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
         BlkGemmPipeSched,
         BlkGemmPipelineVer,
         ComputeTypeA,
-        ComputeTypeB>;
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
 
     using Argument = typename GridwiseGemm::Argument;
 
@@ -633,6 +637,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }
 
+    index_t GetKPerBlock() override { return KPerBlock; }
+
+    bool GetPermuteA() override { return PermuteA; }
+    bool GetPermuteB() override { return PermuteB; }
+
     static auto MakeArgument(const ADataType* p_a,
                              const BDataType* p_b,
                              CDataType* p_c,
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 39b81ca57..86a5af41b 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -7,12 +7,177 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include "ck/utility/amd_inline_asm.hpp"
 #include <cassert>
 
 namespace ck {
+
+// Fast int4x4 to half8_t data type conversion based on paper
+// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
+// (https://arxiv.org/abs/2211.10017) and implementation:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__host__ __device__ inline half4_t pki4_to_half4(int q)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    // Extract the two int4 at low bit and create two fp16 number.
+    int lo = amd_assembly_and_or_b32(q, LO, EX);
+    // Extract the two int4 at hight bit and create two fp16 number.
+    int hi = amd_assembly_and_or_b32(q, HI, EX);
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    vector_type<half_t, 4> res;
+
+    // for two fp16 from lowbit, subtract 1032 to get correct fp16 value
+    res.template AsType<half2_t>()(Number<0>{}) =
+        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+
+    // for two fp16 from highbit, divide 16 and subtract 72 to get correct fp16 value
+    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
+        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
+
+    return res.template AsType<half4_t>()[Number<0>{}];
+}
+
+__host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q)
+{
+#if 1
+    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
+    uint32_t i4s = ((x_u8 & 0x0f) << 16) | ((x_u8 & 0xf0) >> 4);
+
+    const int EX  = 0x64006400;
+    const int SUB = 0xE408E408; //-8
+
+    int lo = i4s | EX;
+
+    return amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+#else
+    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
+
+    vector_type<half_t, 2> res;
+
+    half_t x_h = (x_u8 & 0x0f) - 8;
+    half_t x_l = ((x_u8 & 0xf0) >> 4) - 8;
+
+    res.template AsType<half_t>()(Number<0>{}) = x_l;
+    res.template AsType<half_t>()(Number<1>{}) = x_h;
+
+    return res.template AsType<half2_t>()[Number<0>{}];
+#endif
+}
+
+__host__ __device__ inline bhalf4_t pki4_to_bhalf4(int q)
+{
+    uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
+
+    static constexpr uint32_t fp32_base = 0x4B000000;
+
+    float fp32_intermediates[4];
+
+    uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+    fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650);
+    fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7651);
+    fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7652);
+    fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653);
+
+    fp32_intermediates[0] -= 8388616.f;
+    fp32_intermediates[1] -= 8388616.f;
+    fp32_intermediates[2] -= 8388616.f;
+    fp32_intermediates[3] -= 8388616.f;
+
+    vector_type<bhalf_t, 4> res;
+    res.template AsType<bhalf2_t>()(Number<0>{}) = bit_cast<bhalf2_t>(
+        __byte_perm(fp32_intermediates_casted[1], fp32_intermediates_casted[0], 0x7632));
+    res.template AsType<bhalf2_t>()(Number<1>{}) = bit_cast<bhalf2_t>(
+        __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632));
+
+    return res.template AsType<bhalf4_t>()[Number<0>{}];
+}
+
+__host__ __device__ inline bhalf2_t pki4_to_bhalf2(pk_i4_t q)
+{
+    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
+
+    float x_h = ((x_u8 & 0x0f) >> 0) - 8.f;
+    float x_l = ((x_u8 & 0xf0) >> 4) - 8.f;
+
+    vector_type<bhalf_t, 2> res;
+
+    res.template AsType<bhalf_t>()(Number<0>{}) = type_convert<bhalf_t>(x_l);
+    res.template AsType<bhalf_t>()(Number<1>{}) = type_convert<bhalf_t>(x_h);
+
+    return res.template AsType<bhalf2_t>()[Number<0>{}];
+}
+
 namespace tensor_operation {
 namespace element_wise {
 
+struct PassThroughPack8
+{
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const;
+
+    __host__ __device__ constexpr void operator()(ck::half8_t& y, const ck::pk_i4x4_t& x) const
+    {
+#if 1
+        vector_type<half_t, 8> result;
+
+        result.template AsType<half4_t>()(Number<0>{}) = pki4_to_half4(bit_cast<int>(x));
+        result.template AsType<half4_t>()(Number<1>{}) = pki4_to_half4(bit_cast<int>(x) >> 8);
+
+        y = result.template AsType<half8_t>()[Number<0>{}];
+#else
+        vector_type<half_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};
+
+        dst.template AsType<half2_t>()(Number<0>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<half2_t>()(Number<1>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<half2_t>()(Number<2>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<half2_t>()(Number<3>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+
+        y = dst.template AsType<half8_t>()[Number<0>{}];
+#endif
+    }
+
+    __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const
+    {
+#if 1
+        vector_type<bhalf_t, 8> result;
+
+        result.template AsType<bhalf4_t>()(Number<0>{}) = pki4_to_bhalf4(bit_cast<int>(x));
+        result.template AsType<bhalf4_t>()(Number<1>{}) = pki4_to_bhalf4(bit_cast<int>(x) >> 16);
+
+        y = result.template AsType<bhalf8_t>()[Number<0>{}];
+#else
+        vector_type<bhalf_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};
+
+        dst.template AsType<bhalf2_t>()(Number<0>{}) =
+            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<bhalf2_t>()(Number<1>{}) =
+            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<bhalf2_t>()(Number<2>{}) =
+            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<bhalf2_t>()(Number<3>{}) =
+            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+
+        y          = dst.template AsType<bhalf8_t>()[Number<0>{}];
+#endif
+    }
+
+    constexpr const static bool is_pack8_invocable = true;
+};
+
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wnon-virtual-dtor"
 struct UnaryOpBase
@@ -49,6 +214,24 @@ struct PassThroughPack2
         auto t = type_convert<float2_t>(x);
         y      = type_convert<half2_t>(t);
     }
+
+    __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
+    {
+#if 1
+        uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
+        uint8_t x_l  = (x_u8 & 0x0f) >> 0;
+        uint8_t x_h  = (x_u8 & 0xf0) >> 4;
+
+        auto l_f16 = ck::type_convert<ck::half_t>(x_l);
+        auto h_f16 = ck::type_convert<ck::half_t>(x_h);
+
+        y = {l_f16, h_f16};
+#else
+        uint32_t t = ck::bit_cast<uint8_t>(x);
+        y          = ck::bit_cast<half2_t>(t);
+#endif
+    }
+
     constexpr const static bool is_pack2_invocable = true;
 };
 
@@ -76,6 +259,12 @@ struct PassThrough final : public UnaryOpBase
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const;
 
+    template <>
+    __host__ __device__ void operator()<pk_i4_t, pk_i4_t>(pk_i4_t& y, const pk_i4_t& x) const
+    {
+        y = x;
+    }
+
     template <>
     __host__ __device__ void operator()<float, double>(float& y, const double& x) const
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index 36797a906..a43f0f880 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -127,7 +127,9 @@ template <typename ALayout,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
           BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4,
           typename ComputeTypeA                       = CDataType,
-          typename ComputeTypeB                       = ComputeTypeA>
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
 struct GridwiseGemm_xdl_cshuffle_v3
 {
     static constexpr auto I0 = Number<0>{};
@@ -151,6 +153,20 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
     {
         return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
@@ -319,6 +335,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
         if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
                      GemmSpec == GemmSpecialization::MNKPadding)
         {
@@ -373,15 +393,39 @@ struct GridwiseGemm_xdl_cshuffle_v3
         }
         else
         {
-            // not pad N or K
-            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
+            if constexpr(!PermuteB)
+            {
+                // not pad N or K
+                const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                    b_grid_desc_nraw_kraw,
+                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                               make_pass_through_transform(N)),
+                    make_tuple(Sequence<1>{}, Sequence<0>{}),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+                return b_grid_desc_bk0_n_bk1;
+            }
+            else
+            {
+                // Pre-shuffled Weight
+                // BGlobal[K / KPerBlock, N, KPerBlock / K1, K1] -> BTile[K / K1, N, K1]
+                constexpr index_t BK01 = KPerBlock / BK1Value;
+                const index_t BK0_     = StrideB / BK1Value;
+                const index_t BK00     = BK0_ / BK01;
+
+                const auto b_grid_desc_bk00_n_bk01_bk1_permute =
+                    make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value));
+
+                const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor(
+                    b_grid_desc_bk00_n_bk01_bk1_permute,
+                    make_tuple(make_merge_transform(make_tuple(BK00, BK01)),
+                               make_pass_through_transform(make_tuple(N)),
+                               make_pass_through_transform(BK1Value)),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+                return b_grid_desc_bk0_n_bk1_permute;
+            }
         }
     }
 
@@ -572,7 +616,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
         {
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
             {
-                a_k_split_offset = blockIdx.z * karg.KRead;
+                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
             {
@@ -585,7 +629,15 @@ struct GridwiseGemm_xdl_cshuffle_v3
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
-                b_k_split_offset = blockIdx.z * karg.KRead;
+                if constexpr(!PermuteB)
+                {
+                    b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+                }
+                else
+                {
+                    const int k0_offset = karg.KRead * karg.N;
+                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                }
             }
 
             if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
@@ -625,9 +677,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // in some cases.
         else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
-                                                  ? 1
-                                                  : 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize;
+            constexpr auto MLdsLayer        = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
@@ -761,10 +812,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
             // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(BDataType);
-            ;
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize;
+            constexpr index_t NLdsLayer     = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
@@ -946,8 +995,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         constexpr auto c_block_size =
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
 
-        return math::max((a_block_space_size_aligned * sizeof(ADataType) +
-                          b_block_space_size_aligned * sizeof(BDataType)),
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize +
+                          b_block_space_size_aligned * sizeof(BDataType) / BPackedSize),
                          c_block_size * sizeof(CShuffleDataType));
     }
 
@@ -1312,8 +1361,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
+            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
+                                                                            sizeof(ADataType) /
+                                                                            APackedSize),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
@@ -1706,16 +1756,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_0) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
+            bit_cast<BDataType*>(static_cast<char*>(p_shared_0) +
+                                 a_block_space_size_aligned * sizeof(ADataType)),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_1) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
+            bit_cast<BDataType*>(bit_cast<char*>(p_shared_1) +
+                                 a_block_space_size_aligned * sizeof(ADataType)),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
 
         auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index d7a6a3624..758900200 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1007,6 +1007,13 @@ struct ThreadwiseTensorSliceTransfer_v4
 
     using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
 
+    static constexpr index_t PackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx)
         : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
     {
@@ -1015,6 +1022,11 @@ struct ThreadwiseTensorSliceTransfer_v4
 
         static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                       "wrong! Not divisible");
+
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+        {
+            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
+        }
     }
 
     template <typename SrcRefToOriginDisplacement,
@@ -1109,7 +1121,7 @@ struct ThreadwiseTensorSliceTransfer_v4
 
             move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
 
-            vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
+            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;
 
             using src_vector_t = typename decltype(src_tmp_vector)::type;
 
@@ -1120,7 +1132,8 @@ struct ThreadwiseTensorSliceTransfer_v4
             if constexpr(SrcBuffer::IsDynamicBuffer())
             {
                 src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
-                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset(), is_src_valid);
+                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
+                                                       is_src_valid);
             }
             else if constexpr(SrcBuffer::IsStaticBuffer())
             {
@@ -1133,9 +1146,36 @@ struct ThreadwiseTensorSliceTransfer_v4
                 });
             }
 
-            if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
-                         is_same<remove_cvref_t<DstData>, half_t>::value &&
-                         SrcScalarPerVector % 2 == 0)
+            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
+            {
+                // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
+                // DstData)
+                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
+
+                constexpr index_t pack_size = 8;
+
+                static_assert(SrcScalarPerVector % pack_size == 0, "");
+
+                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
+                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
+
+                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
+                    ck::tensor_operation::element_wise::PassThroughPack8{}(
+                        dst_tmp_vector.template AsType<dst_v_t>()(i),
+                        src_tmp_vector.template AsType<src_v_t>()[i]);
+                });
+
+                // copy data from dst_tmp_vector into dst_buf
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
+
+                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+                });
+            }
+            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
+                              is_same<remove_cvref_t<DstData>, half_t>::value &&
+                              SrcScalarPerVector % 2 == 0)
             {
                 // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
                 // DstData)
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
index 96ea04c8f..8cbe6bd2c 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -31,8 +31,8 @@ template <typename SliceLengths,
           typename DstDimAccessOrder,
           index_t SrcVectorDim,
           index_t DstVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstScalarPerVector,
+          index_t SrcScalarPerVector_,
+          index_t DstScalarPerVector_,
           index_t SrcScalarStrideInVector,
           index_t DstScalarStrideInVector,
           bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
@@ -55,6 +55,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
 
     static constexpr auto I0 = Number<0>{};
 
+    static constexpr index_t PackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr auto SrcScalarPerVector = Number<SrcScalarPerVector_ / PackedSize>{};
+    static constexpr auto DstScalarPerVector = Number<DstScalarPerVector_ / PackedSize>{};
+
     __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(
         const SrcDesc& src_desc,
         const Index& src_slice_origin,
@@ -67,6 +77,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
           src_element_op_(src_element_op),
           dst_element_op_(dst_element_op)
     {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+        {
+            static_assert(is_same_v<remove_cvref_t<SrcData>, remove_cvref_t<DstData>>,
+                          "SrcData != DstData");
+
+            static_assert(
+                SrcScalarPerVector_ % PackedSize == 0 && DstScalarPerVector_ % PackedSize == 0,
+                "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type");
+
+            static_assert(SrcVectorDim == DstVectorDim, "pk_i4_t does not support transpose");
+        }
     }
 
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -95,11 +116,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // scalar per access on each dim
         // TODO: don't use lambda_scalar_per_access
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
-        static_assert(SliceLengths::At(SrcVectorDim) % SrcScalarPerVector == 0,
+        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
                       "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");
 
         constexpr auto src_dim_access_order = SrcDimAccessOrder{};
@@ -180,9 +201,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
             using src_vector_t    = typename src_vector_type::type;
 
-            auto src_vector_container =
-                src_vector_type{src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), true)};
-
             using dst_vector_type = vector_type_maker_t<DstData, SrcScalarPerVector>;
             using dst_vector_t    = typename dst_vector_type::type;
             dst_vector_type op_r_v;
@@ -193,17 +211,22 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                     if constexpr(decltype(src_element_op_)::is_pack8_invocable)
                         return math::min(8, SrcScalarPerVector);
                 }
-                if constexpr(is_detected<is_pack4_invocable_t, decltype(src_element_op_)>::value)
+                else if constexpr(is_detected<is_pack4_invocable_t,
+                                              decltype(src_element_op_)>::value)
                 {
                     if constexpr(decltype(src_element_op_)::is_pack4_invocable)
                         return math::min(4, SrcScalarPerVector);
                 }
-                if constexpr(is_detected<is_pack2_invocable_t, decltype(src_element_op_)>::value)
+                else if constexpr(is_detected<is_pack2_invocable_t,
+                                              decltype(src_element_op_)>::value)
                 {
                     if constexpr(decltype(src_element_op_)::is_pack2_invocable)
                         return math::min(2, SrcScalarPerVector);
                 }
-                return 1;
+                else
+                {
+                    return 1;
+                }
             };
 
             constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
@@ -211,6 +234,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             using src_elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
             using dst_elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;
 
+            auto src_vector_container = src_vector_type{
+                src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, true)};
+
             static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
                 // apply the src elementwise op and convert to DstData under the hood if needed
                 src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
@@ -276,10 +302,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
         });
 #else
-
         // OOB Check
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
@@ -350,6 +375,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                       (is_same<f8_t, remove_cvref_t<DstData>>::value &&
                        SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
         {
+            static_assert(!is_same_v<remove_cvref_t<SrcData>, pk_i4_t>,
+                          "in-register transpose is not supported for pk_i4_t");
             // each transpose does
             // DstScalarPerVector # of src vectors in src_thread_scratch_
             // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
@@ -410,7 +437,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         }
         else
         {
-            static_ford<SliceLengths>{}([&](auto idx) {
+            constexpr auto packed_per_access = generate_sequence(
+                detail::lambda_scalar_per_access<SrcVectorDim, PackedSize>{}, Number<nDim>{});
+
+            constexpr auto packed_access_lengths = SliceLengths{} / packed_per_access;
+
+            static_ford<decltype(packed_access_lengths)>{}([&](auto idx) {
                 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
             });
         }
@@ -438,7 +470,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // src scalar per access on each dim
         // TODO: don't use this
         constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
@@ -526,13 +558,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
 
                 // apply DstElementwiseOperation
                 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
-
-                dst_vector_container.template AsType<DstData>()(i) = dst_v;
             });
 
             // copy data from dst_vector_container to dst_buf
             dst_buf.template Set<dst_vector_t>(
-                dst_coord_.GetOffset(),
+                dst_coord_.GetOffset() / PackedSize,
                 is_dst_valid,
                 dst_vector_container.template AsType<dst_vector_t>()[I0]);
 
@@ -586,7 +616,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // scalar per access on each dim
         // TODO: don't use lambda_scalar_per_access
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
@@ -644,7 +674,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // scalar per access on each dim
         // TODO: don't use lambda_scalar_per_access
         constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
@@ -730,7 +760,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     __device__ static constexpr auto GetSrcThreadScratchDescriptor()
     {
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
@@ -779,7 +809,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     __device__ static constexpr auto GetSrcOOBThreadScratchDescriptor()
     {
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
@@ -790,7 +820,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     {
         // 1st stage of transforms
         constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
 
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index 5367c3d72..ad13c4431 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -429,7 +429,8 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
             (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
         "wrong! not implemented");
 
     using r_t     = typename vector_type<T, N>::type;
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index 5dc67a5ad..6761c08f2 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -11,6 +11,27 @@
 
 namespace ck {
 
+inline __device__ int amd_assembly_and_or_b32(int a, int b, int d)
+{
+    int c;
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(c) : "v"(a), "v"(b), "v"(d));
+    return c;
+}
+
+inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c)
+{
+    half2_t d;
+    asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+    return d;
+}
+
+inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b)
+{
+    half2_t c;
+    asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
+    return c;
+}
+
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
 __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index a7dc071bc..86bc3c394 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -12,6 +12,15 @@ using bhalf_t = ushort;
 using half_t  = _Float16;
 using int4_t  = _BitInt(4);
 
+// custom data type - pack int4 data
+struct pk_i4_t
+{
+    using type = int8_t;
+    type data;
+    __host__ __device__ constexpr pk_i4_t() : data{type{}} {}
+    __host__ __device__ constexpr pk_i4_t(type init) : data{init} {}
+};
+
 inline constexpr auto next_pow2(uint32_t x)
 {
     // Precondition: x > 1.
@@ -165,6 +174,13 @@ struct scalar_type<int4_t>
 };
 #endif
 
+template <>
+struct scalar_type<pk_i4_t>
+{
+    using type                           = pk_i4_t;
+    static constexpr index_t vector_size = 1;
+};
+
 template <>
 struct scalar_type<f8_fnuz_t>
 {
@@ -1044,6 +1060,12 @@ struct nnvb_data_t_selector<bf8_ocp_t>
     using type = bf8_ocp_t::data_type;
 };
 
+template <>
+struct nnvb_data_t_selector<pk_i4_t>
+{
+    using type = pk_i4_t::type;
+};
+
 template <typename T, index_t N>
 struct non_native_vector_base<
     T,
@@ -1163,6 +1185,14 @@ struct scalar_type<non_native_vector_base<bf8_ocp_t, N>>
     static constexpr index_t vector_size = N;
 };
 
+template <index_t N>
+struct scalar_type<non_native_vector_base<pk_i4_t, N>>
+{
+    using type = typename non_native_vector_base<pk_i4_t, N>::data_t;
+
+    static constexpr index_t vector_size = N;
+};
+
 // non-native vector_type implementation
 template <typename T>
 struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>>
@@ -1871,6 +1901,11 @@ using uint8x16_t = typename vector_type<uint8_t, 16>::type;
 using uint8x32_t = typename vector_type<uint8_t, 32>::type;
 using uint8x64_t = typename vector_type<uint8_t, 64>::type;
 
+// pack int4
+using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
+using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
+using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;
+
 template <typename T>
 struct NumericLimits
 {
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 0dcc514a2..639aa1efe 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -54,7 +54,8 @@ struct DynamicBuffer
 
     template <typename X,
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                         typename scalar_type<remove_cvref_t<T>>::type>::value ||
+                                     !is_native_type<X>(),
                                  bool>::type = false>
     __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
     {
@@ -195,7 +196,8 @@ struct DynamicBuffer
 
     template <typename X,
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                         typename scalar_type<remove_cvref_t<T>>::type>::value ||
+                                     !is_native_type<X>(),
                                  bool>::type = false>
     __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
     {
diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp
index 835f56573..602e76abd 100644
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -116,7 +116,8 @@ struct StaticBufferTupleOfVector
     // i is offset of S, not X. i should be aligned to X
     template <typename X,
               index_t I,
-              typename enable_if<has_same_scalar_type<S, X>::value, bool>::type = false>
+              typename enable_if<has_same_scalar_type<S, X>::value || !is_native_type<S>(),
+                                 bool>::type = false>
     __host__ __device__ constexpr auto GetAsType(Number<I> i) const
     {
         constexpr auto s_per_x = Number<scalar_type<remove_cvref_t<X>>::vector_size>{};
@@ -134,7 +135,8 @@ struct StaticBufferTupleOfVector
     // i is offset of S, not X. i should be aligned to X
     template <typename X,
               index_t I,
-              typename enable_if<has_same_scalar_type<S, X>::value, bool>::type = false>
+              typename enable_if<has_same_scalar_type<S, X>::value || !is_native_type<S>(),
+                                 bool>::type = false>
     __host__ __device__ constexpr void SetAsType(Number<I> i, X x)
     {
         constexpr auto s_per_x = Number<scalar_type<remove_cvref_t<X>>::vector_size>{};
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 1ae11fe9d..8dd5d086b 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -74,6 +74,17 @@ struct ReferenceGemm : public device::BaseOperator
                     {
                         ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
                     }
+                    else if constexpr(is_same_v<ADataType, pk_i4_t>)
+                    {
+                        uint8_t i4x2 = arg.a_m_k_(m, k).data;
+                        int8_t i4    = 0;
+                        if(k % 2 == 1)
+                            i4 = (i4x2 >> 0) & 0xf;
+                        else
+                            i4 = (i4x2 >> 4) & 0xf;
+                        i4  = i4 - 8;
+                        v_a = type_convert<ComputeTypeA>(i4);
+                    }
                     else
                     {
                         arg.a_element_op_(v_a, arg.a_m_k_(m, k));
@@ -84,6 +95,17 @@ struct ReferenceGemm : public device::BaseOperator
                     {
                         ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
                     }
+                    else if constexpr(is_same_v<BDataType, pk_i4_t>)
+                    {
+                        uint8_t i4x2 = arg.b_k_n_(k, n).data;
+                        int8_t i4    = 0;
+                        if(k % 2 == 1)
+                            i4 = (i4x2 >> 0) & 0xf;
+                        else
+                            i4 = (i4x2 >> 4) & 0xf;
+                        i4  = i4 - 8;
+                        v_b = type_convert<ComputeTypeB>(i4);
+                    }
                     else
                     {
                         arg.b_element_op_(v_b, arg.b_k_n_(k, n));
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 4358953a5..4a44c425a 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -22,6 +22,7 @@ using I8   = int8_t;
 using I32  = int32_t;
 using F8   = ck::f8_t;
 using BF8  = ck::bf8_t;
+using I4   = ck::pk_i4_t;
 
 using Empty_Tuple = ck::Tuple<>;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
index 52046a107..4218c51ca 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
@@ -166,11 +166,22 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+
 void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, I4, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, I4, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -810,6 +821,28 @@ struct DeviceOperationInstanceFactory<
             }
         }
 #endif
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, pk_i4_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs);
+            }
+        }
+
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, pk_i4_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+
         return op_ptrs;
     }
 };
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
index 188c9f68e..ade65eacf 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
@@ -97,6 +97,9 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 
+          device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
+          device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
new file mode 100644
index 000000000..8d109d134
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I4   = pk_i4_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+#if 0
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+#endif
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|        Block-wiseGemm|               Block-wiseGemm| ACompType| BCompType| APermute| BPermute|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|              Pipeline|                     Pipeline|          |          |         |         |
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|             Scheduler|                     Verision|          |          |         |         |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                      |                             |          |          |         |         |
+
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,   128,   8,   32,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,   128,   8,   16,  16,   16,    4,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,   128,   8,   32,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,   128,   8,   32,  32,   32,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,   128,   8,   32,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   bhalf_t,   bhalf_t,    false,    true>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..b060a92eb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, I4, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
new file mode 100644
index 000000000..680788d66
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I4  = pk_i4_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+#if 0
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+#endif
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm| ACompType| BCompType| APermute| BPermute|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|          |          |         |         |
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|          |          |         |         |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |          |          |         |         |
+
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,   128,   8,   32,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,   128,   8,   16,  16,   16,    4,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,   128,   8,   32,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,   128,   8,   32,  32,   32,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,   128,   8,   32,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2,   half_t,    half_t,    false,    true>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..a884a3ec5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, I4, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index 30f0da212..ed7e86ded 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -65,11 +65,13 @@ bool profile_gemm_universal_impl(int do_verification,
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
-    int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
-    int rotating_count    = std::max(
+    std::size_t total_gemm_needed =
+        a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
+    int rotating_count = std::max(
         1,
         std::min(n_iter,
                  static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
@@ -86,9 +88,13 @@ bool profile_gemm_universal_impl(int do_verification,
         a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
         break;
-    default:
+    case 2:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
     }
 
     using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -100,11 +106,10 @@ bool profile_gemm_universal_impl(int do_verification,
     const auto c_element_op = CElementOp{};
 
     DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
     a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
 
     using DeviceOp = ck::tensor_operation::device::DeviceGemmV2<ALayout,
                                                                 BLayout,
@@ -153,6 +158,84 @@ bool profile_gemm_universal_impl(int do_verification,
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
     {
+        const int KPerBlock = op_ptr->GetKPerBlock();
+
+        if(op_ptr->GetPermuteB())
+        {
+            int K1 = KPerBlock;
+            int K0 = K / KPerBlock;
+
+            // int K0, N, K1
+            for(int j = 0; j < K0; j++)
+            {
+                for(int i = 0; i < N; i++)
+                {
+                    for(int jj = 0; jj < K1; jj++)
+                    {
+                        b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                    }
+                }
+            }
+
+            if(is_same_v<BDataType, pk_i4_t> && is_same_v<ADataType, half_t>)
+            {
+                // vector pk_i4x4 permute
+                for(int i = 0; i < N; i++)
+                {
+                    for(int j = 0; j < K; j += 8)
+                    {
+                        int input[8];
+
+                        for(int k = 0; k < 4; k++)
+                        {
+                            int i4x2         = b_k_n_permute(j + k * 2, i);
+                            input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                            input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                        }
+
+                        // permute 01234567->20643175
+                        {
+                            int hi   = input[2];
+                            int lo   = input[0];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 0, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[6];
+                            int lo   = input[4];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 2, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[3];
+                            int lo   = input[1];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 4, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[7];
+                            int lo   = input[5];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 6, i) = i4x2;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            b_k_n_permute = b_k_n;
+        }
+
+        b_device_buf.ToDevice(b_k_n_permute.mData.data());
+
         std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
 
         if(KBatch > 0)
@@ -240,7 +323,15 @@ bool profile_gemm_universal_impl(int do_verification,
 
                 std::size_t flop = std::size_t(2) * M * N * K;
 
-                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                static constexpr index_t BPackedSize = []() {
+                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                        return 2;
+                    else
+                        return 1;
+                }();
+
+                std::size_t num_btype = sizeof(ADataType) * M * K +
+                                        sizeof(BDataType) * K * N / BPackedSize +
                                         sizeof(CDataType) * M * N;
 
                 float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 35e91f817..a0978eb6b 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -177,5 +177,4 @@ if(DL_KERNELS)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
 endif()
-
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index 990cbd292..a22d983da 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -1,10 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
 
 #include "profiler/profile_gemm_universal_impl.hpp"
 #include "profiler_operation_registry.hpp"
@@ -27,6 +27,8 @@ enum struct GemmDataType
     F16_F8_F16,     // 5
     F16_F16_F16_F8, // 6
     F8_F8_BF16,     // 7
+    F16_I4_F16,     // 8
+    BF16_I4_BF16,   // 9
 };
 
 #define OP_NAME "gemm_universal"
@@ -39,7 +41,7 @@ int profile_gemm_universal(int argc, char* argv[])
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
                "f16->f8; 7: f8->bf16, "
-               "comp f8)\n");
+               "comp f8; 8: f16@i4; 9: bf16@i4\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -103,6 +105,7 @@ int profile_gemm_universal(int argc, char* argv[])
     using BF16 = ck::bhalf_t;
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
+    using I4 = ck::pk_i4_t;
 #endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
@@ -207,6 +210,14 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
     }
+    else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_I4_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(BF16{}, I4{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
 #endif
     else
     {
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 4097ca98f..f7177a7ab 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -17,7 +17,7 @@ fi
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
--D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
+-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
-- 
GitLab


From 9e95d54cd2160dffc07c1197951a9ab1ca6c35f2 Mon Sep 17 00:00:00 2001
From: Muhammed  Emin Ozturk <Muhammed.Ozturk@amd.com>
Date: Thu, 2 Jan 2025 10:30:04 -0800
Subject: [PATCH 146/153] BF16 GEMM Stream-K (#1541)

* initial

* Cmake file

* successfull compilation but validation failed

* Cmake

* update

* gpu validation

* gemm universal

* gemm universal sk update

* sk bf16 universal instance

* gemm_universal_streamk.hpp

* only build for gfx94

* Cmakelist

* profiler update, bf16 sk only works at gfx42

* clang

* clang

* clang all

* no need flags

* cmake script

* delete comment

* gemm universal sk fix

* clang

* profiler fix

* clang

* update

* update

* delete comment

* code formatting

* cmake

* fix instance

* clang

* argument supported

* argument supported and clang

* update

* fix

* removing unnecessary comments

* clang formatting

* Update library/src/tensor_operation_instance/gpu/CMakeLists.txt

Co-authored-by: afagaj <john.afaganis@gmail.com>

* CopyRight Comment 2025

* clang reformatting

* copy right 2025

---------

Co-authored-by: Emin Ozturk <ozturk.27@osu.edu>
Co-authored-by: root <root@ctr-ubbsmc16.amd.com>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t004-008.hpcfund>
Co-authored-by: root <root@splinter-126-wr-d3.amd.com>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t006-001.hpcfund>
Co-authored-by: Muhammed Emin Ozturk <meozturk@login1.hpcfund>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t004-004.hpcfund>
Co-authored-by: Emin Ozturk <emin.ozturk@utah.edu>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t008-001.hpcfund>
Co-authored-by: afagaj <john.afaganis@gmail.com>
---
 example/01_gemm/CMakeLists.txt                |   3 +
 example/01_gemm/gemm_xdl_bf16.cpp             |   0
 example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp  |  59 +++
 example/01_gemm/gemm_xdl_streamk.cpp          |   1 -
 .../01_gemm/run_gemm_example_streamk_v2.inc   |   0
 .../device_gemm_xdl_cshuffle_streamk_v3.hpp   |   6 +-
 .../gpu/gemm_universal_streamk.hpp            | 500 ++++++++++++++++++
 .../gpu/CMakeLists.txt                        |   8 +-
 .../gpu/gemm_universal_streamk/CMakeLists.txt |  39 +-
 ...versal_streamk_bf16_bf16_bf16_km_kn_mn.hpp |  91 ++++
 ...16_bf16_km_kn_mn_comp_default_instance.cpp |  30 ++
 ...6_bf16_km_kn_mn_comp_kpadding_instance.cpp |  30 ++
 ...bf16_km_kn_mn_comp_mnkpadding_instance.cpp |  30 ++
 ..._bf16_km_kn_mn_comp_mnpadding_instance.cpp |  30 ++
 ..._bf16_km_kn_mn_mem_v1_default_instance.cpp |  31 ++
 ...bf16_km_kn_mn_mem_v1_kpadding_instance.cpp |  31 ++
 ...16_km_kn_mn_mem_v1_mnkpadding_instance.cpp |  31 ++
 ..._bf16_km_kn_mn_mem_v2_default_instance.cpp |  31 ++
 ...bf16_km_kn_mn_mem_v2_kpadding_instance.cpp |  31 ++
 ...16_km_kn_mn_mem_v2_mnkpadding_instance.cpp |  31 ++
 ...versal_streamk_bf16_bf16_bf16_km_nk_mn.hpp |  97 ++++
 ...16_bf16_km_nk_mn_comp_default_instance.cpp |  30 ++
 ...6_bf16_km_nk_mn_comp_kpadding_instance.cpp |  30 ++
 ..._bf16_km_nk_mn_comp_mkpadding_instance.cpp |  30 ++
 ...6_bf16_km_nk_mn_comp_mpadding_instance.cpp |  30 ++
 ..._bf16_km_nk_mn_mem_v1_default_instance.cpp |  31 ++
 ...bf16_km_nk_mn_mem_v1_kpadding_instance.cpp |  31 ++
 ...f16_km_nk_mn_mem_v1_mkpadding_instance.cpp |  31 ++
 ..._bf16_km_nk_mn_mem_v2_default_instance.cpp |  31 ++
 ...bf16_km_nk_mn_mem_v2_kpadding_instance.cpp |  31 ++
 ...f16_km_nk_mn_mem_v2_mkpadding_instance.cpp |  31 ++
 ...versal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp |  89 ++++
 ...16_bf16_mk_kn_mn_comp_default_instance.cpp |  30 ++
 ...6_bf16_mk_kn_mn_comp_kpadding_instance.cpp |  30 ++
 ...bf16_mk_kn_mn_comp_mnkpadding_instance.cpp |  30 ++
 ..._bf16_mk_kn_mn_comp_mnpadding_instance.cpp |  30 ++
 ..._bf16_mk_kn_mn_mem_v1_default_instance.cpp |  31 ++
 ...bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp |  31 ++
 ...16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp |  31 ++
 ..._bf16_mk_kn_mn_mem_v2_default_instance.cpp |  31 ++
 ...bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp |  31 ++
 ...16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp |  31 ++
 ...versal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp |  93 ++++
 ...16_bf16_mk_nk_mn_comp_default_instance.cpp |  30 ++
 ...6_bf16_mk_nk_mn_comp_kpadding_instance.cpp |  30 ++
 ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp |  31 ++
 ...bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp |  31 ++
 ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp |  31 ++
 ...bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp |  31 ++
 .../src/profile_gemm_universal_streamk.cpp    |  21 +-
 script/cmake-ck-dev.sh                        |   2 +-
 51 files changed, 2101 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 example/01_gemm/CMakeLists.txt
 mode change 100644 => 100755 example/01_gemm/gemm_xdl_bf16.cpp
 create mode 100755 example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp
 mode change 100644 => 100755 example/01_gemm/gemm_xdl_streamk.cpp
 mode change 100755 => 100644 example/01_gemm/run_gemm_example_streamk_v2.inc
 mode change 100755 => 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
 mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/CMakeLists.txt
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 mode change 100755 => 100644 profiler/src/profile_gemm_universal_streamk.cpp

diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
old mode 100644
new mode 100755
index df7be0466..354e443b3
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -35,6 +35,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
 add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
 
+add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
+
 add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
 
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
old mode 100644
new mode 100755
diff --git a/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp b/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp
new file mode 100755
index 000000000..5b56a4348
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+using ADataType        = ck::bhalf_t;
+using BDataType        = ck::bhalf_t;
+using CDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::bhalf_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmV2_Streamk_Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType,   BDataType,  CDataType,  AccDataType,  CShuffleDataType, 
+        PassThrough, PassThrough, PassThrough, GemmDefault, 
+        256,
+        128, 128, 
+        64, 8, 8,
+        16,   16,
+        4,    4,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 8, 8, 0,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 8, 8, 0,
+        1, 2, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example_streamk_v2.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/example/01_gemm/gemm_xdl_streamk.cpp
old mode 100644
new mode 100755
index 5a02457da..dbdf7199e
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
@@ -15,7 +15,6 @@ using F16 = ck::half_t;
 
 using ALayout = Row;
 using BLayout = Row;
-// using BLayout = Col;
 using CLayout = Row;
 
 using AElementOp = PassThrough;
diff --git a/example/01_gemm/run_gemm_example_streamk_v2.inc b/example/01_gemm/run_gemm_example_streamk_v2.inc
old mode 100755
new mode 100644
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
old mode 100755
new mode 100644
index cfd9a1204..26be5cfc6
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -469,7 +469,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
         {
             return false;
         }
-
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> &&
+           arg.Streamk_sel > 0)
+        {
+            return false;
+        }
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
                                                        GemmSpec == GemmSpecialization::MNKPadding ||
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
index f44c02517..18203e7d5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
@@ -238,6 +238,403 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin
                                                       PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+#endif
+
 #if(defined(CK_ENABLE_FP8))
 void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
     std::vector<std::unique_ptr<
@@ -527,6 +924,109 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
         }
 #endif
 
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
 #if(defined(CK_ENABLE_FP8))
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
                      is_same_v<CDataType, half_t>)
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
old mode 100644
new mode 100755
index dd023e6b5..d72281f43
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -183,6 +183,10 @@ FOREACH(subdir_path ${dir_list})
             message("bf8 instance found!")
             set(add_inst 1)
         endif()
+        if(("${cmake_instance}" MATCHES "_bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
+            message("bf16 instance found!")
+            set(add_inst 1)
+        endif()
         if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
             message("fp16 instance found!")
             set(add_inst 1)
@@ -195,10 +199,6 @@ FOREACH(subdir_path ${dir_list})
             message("fp64 instance found!")
             set(add_inst 1)
         endif()
-        if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16")
-            message("bf16 instance found!")
-            set(add_inst 1)
-        endif()
         if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
             message("int8 instance found!")
             set(add_inst 1)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
index 08746a52d..e1612bcd2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
@@ -64,6 +64,43 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+        
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp)
 
 add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp
new file mode 100755
index 000000000..b4554fc6a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        // Can we support this kind of odd case? 224(256) = 28*8 + (4*8)
+        //DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   4,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              4,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   4,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   2,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
new file mode 100755
index 000000000..9b21e0bbd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
new file mode 100755
index 000000000..9b9195a44
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
new file mode 100755
index 000000000..d941d769c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp
new file mode 100755
index 000000000..0cc69b589
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp
new file mode 100755
index 000000000..e5cf052b3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp
new file mode 100755
index 000000000..9ce9f8678
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Intrawave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100755
index 000000000..c95d90793
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Intrawave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp
new file mode 100755
index 000000000..1071a2ac0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp
new file mode 100755
index 000000000..ad569f721
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Interwave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100755
index 000000000..0dad13c7e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_instances<Interwave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp
new file mode 100755
index 000000000..b6a60a1f3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   8,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   4,   4,  16,   16,    2,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   8,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   8,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   8,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   2,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
new file mode 100755
index 000000000..1cca948e5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
new file mode 100755
index 000000000..85bed59c1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
new file mode 100755
index 000000000..e6cb58f28
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmMKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
new file mode 100755
index 000000000..feeed4cc7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmMPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
new file mode 100755
index 000000000..32ba00037
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100755
index 000000000..2a7a683c3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
new file mode 100755
index 000000000..975313603
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave,
+                                                                                GemmMKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
new file mode 100755
index 000000000..f9175984f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100755
index 000000000..181faa74b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
new file mode 100755
index 000000000..417fb4066
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave,
+                                                                                GemmMKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
new file mode 100755
index 000000000..763ac4fac
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100755
index 000000000..8b2bfb5d2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
new file mode 100755
index 000000000..a7c33ffdc
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
new file mode 100755
index 000000000..adc2f23d4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
new file mode 100755
index 000000000..0336f6466
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
new file mode 100755
index 000000000..54488f269
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
new file mode 100755
index 000000000..8477a48be
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100755
index 000000000..0621df013
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
new file mode 100755
index 000000000..49fd1ccd3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
new file mode 100755
index 000000000..354231624
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100755
index 000000000..dff56ca62
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
new file mode 100755
index 000000000..7a59823d9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        // AGPR Spill
+        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   4,   4,  16,   16,    4,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100755
index 000000000..e192bf14c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
new file mode 100755
index 000000000..d58ec3eb3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100755
index 000000000..545ef40b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100755
index 000000000..d899d5704
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100755
index 000000000..a5a5640eb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100755
index 000000000..a4e69f0a2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                                GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp
old mode 100755
new mode 100644
index 85f6c2577..a94bb866f
--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -83,8 +83,9 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         rotating = std::stoull(argv[18]) * 1024 * 1024;
     }
 
-    using F32 = float;
-    using F16 = ck::half_t;
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
 
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
@@ -165,6 +166,22 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
 #endif
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
+    }
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index f7177a7ab..6089fc7a7 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -15,7 +15,7 @@ else
 fi
 
 cmake                                                                                             \
--D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+-D CMAKE_PREFIX_PATH=/opt/rocm/                                                                   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
 -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
-- 
GitLab


From 17e8efb573781febcf3256b10751e7e39b1a2197 Mon Sep 17 00:00:00 2001
From: John Afaganis <john.afaganis@amd.com>
Date: Thu, 2 Jan 2025 19:50:07 -0700
Subject: [PATCH 147/153] Add afagaj to CODEOWNERS (#1787)

---
 .github/CODEOWNERS | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d7a6b1778..f6ab388e2 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca
+library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
-- 
GitLab


From 4bc610416ada7504c62d02c5cde0187f22f59d80 Mon Sep 17 00:00:00 2001
From: feli <felix.li@amd.com>
Date: Fri, 3 Jan 2025 14:28:59 +0800
Subject: [PATCH 148/153] Ck tile/layernorm:  implement naive reduce, opt
 performance  (#1784)

* add no welford

* enable output raw

* raw of int8

* fix build

* fix smoke test err

* [ck_tile]layernorm: fix welford ok, set int8 and bf16 small N as default and others open by generate

* [cktile]layernorm, fix err commit files and remove uselss

* fix quant 8192 err & change norm_reduce class and file name

---------

Co-authored-by: coderfeli <coderfeli@163.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
---
 example/ck_tile/02_layernorm2d/generate.py    | 160 ++++++++++--------
 .../02_layernorm2d/script/smoke_test.sh       |   3 +-
 ...ayernorm2d_fwd_pipeline_default_policy.hpp |  57 ++++---
 .../layernorm2d_fwd_pipeline_one_pass.hpp     |  40 +++--
 .../layernorm2d_fwd_pipeline_two_pass.hpp     |  20 ++-
 .../pipeline/layernorm2d_fwd_traits.hpp       |   2 +
 .../ops/{welford.hpp => norm_reduce.hpp}      |   6 +-
 .../block/block_norm_reduce.hpp}              | 126 ++++++++------
 .../block/block_norm_reduce_problem.hpp}      |   9 +-
 .../thread/thread_welford.hpp                 |   0
 10 files changed, 253 insertions(+), 170 deletions(-)
 rename include/ck_tile/ops/{welford.hpp => norm_reduce.hpp} (54%)
 rename include/ck_tile/ops/{welford/block/block_welford.hpp => norm_reduce/block/block_norm_reduce.hpp} (79%)
 rename include/ck_tile/ops/{welford/block/block_welford_problem.hpp => norm_reduce/block/block_norm_reduce_problem.hpp} (66%)
 rename include/ck_tile/ops/{welford => norm_reduce}/thread/thread_welford.hpp (100%)

diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index ca9e432a4..0581c4597 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -58,6 +58,7 @@ template <typename XDataType_,
           bool kPadN_,
           bool kSaveMeanInvStd_,
           bool kFastFDiv_,
+          bool kWelford_,
           bool kTwoPass_,
           ck_tile::index_t kFusedAdd_ = 0,
           ck_tile::index_t kFusedQuant_ = 0>
@@ -120,6 +121,7 @@ struct layernorm2d_fwd_traits_
     static constexpr bool kPadN           = kPadN_;
     static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
     static constexpr bool kFastFDiv       = kFastFDiv_;
+    static constexpr bool kWelford        = kWelford_;
     static constexpr bool kTwoPass        = kTwoPass_;
     static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
     static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
@@ -137,6 +139,7 @@ template <typename XDataType_,
           bool kPadN_,
           bool kSaveMeanInvStd_,
           bool kFastFDiv_,
+          bool kWelford_,
           bool kTwoPass_,
           int  kFusedAdd_,
           int  kFusedQuant_>
@@ -152,6 +155,7 @@ using traits_ = layernorm2d_fwd_traits_<XDataType_,
                                        kPadN_,
                                        kSaveMeanInvStd_,
                                        kFastFDiv_,
+                                       kWelford_,
                                        kTwoPass_,
                                        kFusedAdd_,
                                        kFusedQuant_>;
@@ -184,6 +188,7 @@ float layernorm2d_fwd_(const S& s, A a)
     using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
         Traits_::kSaveMeanInvStd,
         Traits_::kFastFDiv,
+        Traits_::kWelford,
         Traits_::kTwoPass,
         static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
         static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
@@ -204,12 +209,13 @@ float layernorm2d_fwd_(const S& s, A a)
     using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
     using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
 
-    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
+    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, true>;
     using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
 
     static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
+    static constexpr bool UseRawStore = sizeof(YDataType) == 4;
     using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, XScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
-            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, false,  true/*max3*/>>;
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, UseRawStore,  true/*max3*/>>;
 
     using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
 
@@ -274,7 +280,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
 #include "layernorm2d_fwd_api_common.hpp"
 
 // clang-format off
-//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf    2p      add  sweep
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf   welford   2p    add  sweep
 {F_instance_def}
 // clang-format on
 
@@ -362,6 +368,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         F_kPadN : bool
         F_kSaveMeanInvStd_ : bool
         F_kFastFDiv_ : bool
+        F_kWelford_ : bool
         F_kTwoPass_ : bool
         F_kFusedAdd : int
         F_kFusedQuant : int
@@ -369,7 +376,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         @property
         def trait_name(self) ->str:
             t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
-            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}'
             t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
             return t_
 
@@ -422,11 +429,10 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
     def name_common_header(self) -> str:
         return 'layernorm2d_fwd_api_common'
 
-    @property
-    def content_api(self) -> str:
+    def content_api(self, args) -> str:
         # 1 sort based on dtype
         t_dtype_dict = dict()
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
         for blob in blobs:
             if blob.F_DataTypePair not in t_dtype_dict:
                 t_dtype_dict[blob.F_DataTypePair] = {}
@@ -462,8 +468,8 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                         inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
                                             F_VEC_COND = _cond, F_instance_func=ins.call_name)
                     #inner_str = inner_str + vec_str
-                n_cnd = f'(a.n <= {n_})' if (i_n < len(blob_per_t) - 1) else ''
-                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
+                n_cnd = f'(a.n <= {n_})' if isinstance(n_, int) else ''
+                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t), not isinstance(n_, int)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
             prec_i, prec_o = dtype_.split(',')
             d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
 
@@ -474,7 +480,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
     def content_common_header(self) -> str:
         return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
 
-    def get_blobs(self):
+    def get_blobs(self, args):
         h_traits = layernorm_fwd_codegen.h_traits
         h_instance = layernorm_fwd_codegen.h_instance
 
@@ -484,60 +490,61 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         scale_list = [('fp32,fp32')]
         dtype_list = [('fp16,fp16'), ('bf16,bf16'),
                         ('fp16,int8'), ('bf16,int8')] # NOTE: only fused-dynamic-quant use int8 out
+        types_8bit = ('int8', 'fp8')
+        types_16bit = ('int16', 'fp16', 'bf16')
         #fused_add_list = [0, 1, 2]
         #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
         fused_add_list = [0, 1]
         fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
-
-        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, False,   0,    0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, False,   0,    0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, False,   0,    0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, False,   0,    0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, False,   0,    0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, False,   0,    0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, False,   0,    0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, False,   0,    0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, False,   0,    0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, False,   0,    0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, False,   0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True,  True,   0,    0)]}
+        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  welford   2p     add   sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, True,   False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, True,   False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, True,   False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, True,   False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, True,   False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, True,   False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, True,   False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, True,   False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, True,   False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,   False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, True,   False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, True,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, True,   False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,    True,   0,    0)]}
         total_blob = list()
         for hs_key in h_trait_dict:
             hs = h_trait_dict[hs_key]
@@ -558,16 +565,27 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                     h_.F_YScaleDataType = scale_x
                     h_.F_kFusedAdd = fused_add
                     h_.F_kFusedQuant = fused_quant
+                    # disable welford update for 8bit and 16 bit smallN
+                    if not h_.F_kTwoPass_:
+                        #disable 16 bit when set args disable_16b_welford
+                        if args.disable_16b_welford and prec_i in types_16bit:
+                            h_.F_kWelford_ = False
+                        #disable 8bit by default
+                        elif prec_i in types_8bit or prec_o in types_8bit:
+                            h_.F_kWelford_ = False
+                        #disable 16bit small N
+                        elif prec_i in types_16bit and hs_key == '64':
+                            h_.F_kWelford_ = False
                     current_hs.append(h_) # + "\n"
                 #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
                 current_n_str = 'big' if hs_key == 'big' else current_n
                 total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
         return total_blob
 
-    def list_blobs(self) -> None:
+    def list_blobs(self, args) -> None:
         w_p = Path(self.working_path)
         list_p = w_p / 'layernorm2d_fwd_blobs.txt'
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
         with list_p.open('w') as list_f:
             # api related file
             list_f.write(str(w_p / (self.name_api + ".cpp"))  + "\n")
@@ -576,11 +594,12 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
             for b in blobs:
                 list_f.write(str(w_p / (b.name + ".cpp")) + "\n")
 
-    def gen_blobs(self) -> None:
+    def gen_blobs(self, args) -> None:
         w_p = Path(self.working_path)
-        (w_p / (self.name_api + ".cpp")).write_text(self.content_api)
+        w_str = self.content_api(args)
+        (w_p / (self.name_api + ".cpp")).write_text(w_str)
         (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header)
-        blobs = self.get_blobs()
+        blobs = self.get_blobs(args)
         for b in blobs:
             (w_p / (b.name + ".cpp")).write_text(b.content)
 
@@ -588,14 +607,14 @@ def list_blobs(args):
     api_list = args.api.split(',')
     for api in api_list:
         if api == 'fwd':
-            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs()
+            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs(args)
 
 
 def gen_blobs(args):
     api_list = args.api.split(',')
     for api in api_list:
         if api == 'fwd':
-            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs()
+            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs(args)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -663,6 +682,13 @@ if __name__ == "__main__":
         help="codegen receipt."
     )
 
+    parser.add_argument(
+        "--disable_16b_welford",
+        default=False,
+        required=False,
+        help="enable/disable welford for 16bit datatype n > 64"
+    )
+
     args = parser.parse_args()
 
     # print(f'{args.list_blobs}-{args.gen_blobs}')
diff --git a/example/ck_tile/02_layernorm2d/script/smoke_test.sh b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
index b7fd354bb..3f5c3eb13 100755
--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -27,7 +27,8 @@ $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
-#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=9120
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
 #$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
 done
 done
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
index 724f6261d..37f87b4fe 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp
@@ -4,8 +4,8 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
-#include "ck_tile/ops/welford/block/block_welford.hpp"
+#include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp"
+#include "ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp"
 
 namespace ck_tile {
 
@@ -43,36 +43,38 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelford()
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockNormReduce()
     {
-        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
-                                       typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape,
-                                       Problem::Traits::kFastFDiv>;
-
-        return BlockWelford<P_>{};
+        using P_ = BlockNormReduceProblem<typename Problem::ComputeDataType,
+                                          typename Problem::ComputeDataType,
+                                          typename Problem::BlockShape,
+                                          Problem::Traits::kFastFDiv,
+                                          Problem::Traits::kWelford>;
+        return BlockNormReduce<P_>{};
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordSync()
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockNormReduceSync()
     {
-        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
-                                       typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape,
-                                       Problem::Traits::kFastFDiv>;
+        using P_ = BlockNormReduceProblem<typename Problem::ComputeDataType,
+                                          typename Problem::ComputeDataType,
+                                          typename Problem::BlockShape,
+                                          Problem::Traits::kFastFDiv,
+                                          Problem::Traits::kWelford>;
 
-        return BlockWelfordSync<P_>{};
+        return BlockNormReduceSync<P_>{};
     }
 
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetBlockWelfordCrossWarpSync()
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockNormReduceCrossWarpSync()
     {
-        using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
-                                       typename Problem::ComputeDataType,
-                                       typename Problem::BlockShape,
-                                       Problem::Traits::kFastFDiv>;
+        using P_ = BlockNormReduceProblem<typename Problem::ComputeDataType,
+                                          typename Problem::ComputeDataType,
+                                          typename Problem::BlockShape,
+                                          Problem::Traits::kFastFDiv,
+                                          Problem::Traits::kWelford>;
 
-        return BlockWelfordCrossWarpSync<P_>{};
+        return BlockNormReduceCrossWarpSync<P_>{};
     }
 
     template <typename Problem>
@@ -80,19 +82,20 @@ struct Layernorm2dFwdPipelineDefaultPolicy
     {
         if constexpr(Problem::kNeedCrossWarpSync)
         {
-            using P_ = BlockWelfordProblem<typename Problem::ComputeDataType,
-                                           typename Problem::ComputeDataType,
-                                           typename Problem::BlockShape,
-                                           Problem::Traits::kFastFDiv>;
+            using P_ = BlockNormReduceProblem<typename Problem::ComputeDataType,
+                                              typename Problem::ComputeDataType,
+                                              typename Problem::BlockShape,
+                                              Problem::Traits::kFastFDiv,
+                                              Problem::Traits::kWelford>;
 
-            using block_welford = BlockWelford<P_>;
+            using block_welford = BlockNormReduce<P_>;
             using x_block_tile =
                 decltype(make_static_distributed_tensor<typename Problem::ComputeDataType>(
                     MakeXBlockTileDistribution<Problem>()));
             using mean_var_block_tile =
                 decltype(block_welford::template MakeMeanVarBlockTile<x_block_tile>());
 
-            return GetBlockWelfordCrossWarpSync<Problem>()
+            return GetBlockNormReduceCrossWarpSync<Problem>()
                 .template GetSmemSize<mean_var_block_tile>();
         }
         else
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index eefdaf917..a30a9256a 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -37,6 +37,7 @@ struct Layernorm2dFwdPipelineOnePass
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
     static constexpr bool kPadN              = Problem::Traits::kPadN;
     static constexpr bool kFastFDiv          = Problem::Traits::kFastFDiv;
+    static constexpr bool kWelford           = Problem::Traits::kWelford;
     static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
@@ -95,11 +96,16 @@ struct Layernorm2dFwdPipelineOnePass
         int cur_count = 0;
         int max_count =
             block_tile_welford_calculate_max_count<typename Problem::BlockShape>(row_size);
-        auto block_welford      = Policy::template GetBlockWelford<Problem>();
-        auto block_welford_sync = Policy::template GetBlockWelfordSync<Problem>();
-        auto block_welford_cross_warp_sync =
-            Policy::template GetBlockWelfordCrossWarpSync<Problem>();
-
+        auto block_norm_reduce      = Policy::template GetBlockNormReduce<Problem>();
+        auto block_norm_reduce_sync = Policy::template GetBlockNormReduceSync<Problem>();
+        auto block_norm_reduce_cross_warp_sync =
+            Policy::template GetBlockNormReduceCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(cast_tile<ComputeDataType>(x));
+        auto mean         = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
+        auto var          = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
+        clear_tile(mean);
+        clear_tile(var);
         // load gamma/beta (TODO: support no gamma/beta?)
         const auto gamma = load_tile(gamma_window);
         const auto beta  = load_tile(beta_window);
@@ -117,12 +123,21 @@ struct Layernorm2dFwdPipelineOnePass
                 store_tile(y_residual_window, cast_tile<YResidualDataType>(acc));
         }
 
-        // compute welford each-thread->cross-lane->cross-warp
-        auto [mean, var] = block_welford(acc, cur_count, max_count);
-        block_welford_sync(mean, var, cur_count);
-        block_welford_cross_warp_sync(mean, var, cur_count, smem);
-        block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
-
+        // compute reduce each-thread->cross-lane->cross-warp
+        block_norm_reduce(acc, mean, var, cur_count, max_count);
+        block_norm_reduce_sync(mean, var, cur_count);
+        block_norm_reduce_cross_warp_sync(mean, var, cur_count, smem);
+        if(kWelford)
+        {
+            block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
+        }
+        else
+        {
+            sweep_tile(mean, [&](auto idx) {
+                mean(idx) = mean(idx) / type_convert<MeanDataType>(row_size);
+                var(idx)  = var(idx) / type_convert<MeanDataType>(row_size) - mean(idx) * mean(idx);
+            });
+        }
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
@@ -153,8 +168,7 @@ struct Layernorm2dFwdPipelineOnePass
             const auto beta_  = type_convert<ComputeDataType>(beta[j_idx]);
 
             auto ln_ = (acc[idx] - mean_[i_idx]) * inv_std[i_idx] * gamma_ + beta_;
-
-            ln(idx) = ln_;
+            ln(idx)  = ln_;
         });
 
         if constexpr(kFusedQuant == Layernorm2dFusedQuantEnum::DYNAMIC_QUANT ||
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 6a86cc43c..4a37be877 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -36,6 +36,7 @@ struct Layernorm2dFwdPipelineTwoPass
     static constexpr bool kPadM              = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
     static constexpr bool kPadN              = Problem::Traits::kPadN;
     static constexpr bool kFastFDiv          = Problem::Traits::kFastFDiv;
+    static constexpr bool kWelford           = Problem::Traits::kWelford;
     static constexpr auto kFusedAdd          = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant        = Problem::Traits::kFusedQuant;
 
@@ -77,6 +78,7 @@ struct Layernorm2dFwdPipelineTwoPass
                                    void* smem,
                                    Epilogue) const
     {
+        static_assert(kWelford == true, "2 pass only supports welford merge");
         auto x_window =
             make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
         auto gamma_window = make_tile_window(
@@ -102,14 +104,14 @@ struct Layernorm2dFwdPipelineTwoPass
         int max_count =
             (num_n_tile_iteration - 1) * count_per_iter +
             block_tile_welford_calculate_max_count<typename Problem::BlockShape>(last_iter_n);
-        auto block_welford      = Policy::template GetBlockWelford<Problem>();
-        auto block_welford_sync = Policy::template GetBlockWelfordSync<Problem>();
-        auto block_welford_cross_warp_sync =
-            Policy::template GetBlockWelfordCrossWarpSync<Problem>();
+        auto block_norm_reduce      = Policy::template GetBlockNormReduce<Problem>();
+        auto block_norm_reduce_sync = Policy::template GetBlockNormReduceSync<Problem>();
+        auto block_norm_reduce_cross_warp_sync =
+            Policy::template GetBlockNormReduceCrossWarpSync<Problem>();
 
         using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(x_window)));
-        auto mean         = block_welford.template MakeMeanVarBlockTile<XTensorType>();
-        auto var          = block_welford.template MakeMeanVarBlockTile<XTensorType>();
+        auto mean         = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
+        auto var          = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
 
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
@@ -133,11 +135,11 @@ struct Layernorm2dFwdPipelineTwoPass
                     move_tile_window(y_residual_window, {0, Block_N});
                 }
             }
-            block_welford(acc, mean, var, cur_count, max_count);
+            block_norm_reduce(acc, mean, var, cur_count, max_count);
         }
 
-        block_welford_sync(mean, var, cur_count);
-        block_welford_cross_warp_sync(mean, var, cur_count, smem);
+        block_norm_reduce_sync(mean, var, cur_count);
+        block_norm_reduce_cross_warp_sync(mean, var, cur_count, smem);
         block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
 
         // compute inv-std
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
index e8c22f8ab..045bd24e4 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
@@ -40,6 +40,7 @@ template<> struct Layernorm2dFusedQuantEnumName<Layernorm2dFusedQuantEnum::SMOOT
 template <bool kPadN_,
           bool kSaveMeanInvStd_,
           bool kFastFDiv_,
+          bool kWelford_,
           bool kTwoPass_,
           Layernorm2dFusedAddEnum kFusedAdd_,
           Layernorm2dFusedQuantEnum kFusedQuant_>
@@ -48,6 +49,7 @@ struct Layernorm2dFwdTraits
     static constexpr bool kPadN                            = kPadN_;
     static constexpr bool kSaveMeanInvStd                  = kSaveMeanInvStd_;
     static constexpr bool kFastFDiv                        = kFastFDiv_;
+    static constexpr bool kWelford                         = kWelford_;
     static constexpr bool kTwoPass                         = kTwoPass_;
     static constexpr Layernorm2dFusedAddEnum kFusedAdd     = kFusedAdd_;
     static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_;
diff --git a/include/ck_tile/ops/welford.hpp b/include/ck_tile/ops/norm_reduce.hpp
similarity index 54%
rename from include/ck_tile/ops/welford.hpp
rename to include/ck_tile/ops/norm_reduce.hpp
index a4c479dd9..02d8eabd8 100644
--- a/include/ck_tile/ops/welford.hpp
+++ b/include/ck_tile/ops/norm_reduce.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include "ck_tile/ops/welford/block/block_welford.hpp"
-#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
-#include "ck_tile/ops/welford/thread/thread_welford.hpp"
+#include "ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp"
+#include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp"
+#include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
similarity index 79%
rename from include/ck_tile/ops/welford/block/block_welford.hpp
rename to include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
index 56ca86d9d..15ac02163 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
@@ -4,22 +4,23 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/welford/thread/thread_welford.hpp"
+#include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp"
 
 namespace ck_tile {
 
 template <typename Problem_, typename Policy_ = void>
-struct BlockWelford
+struct BlockNormReduce
 {
     using Problem                   = remove_cvref_t<Problem_>;
     using XDataType                 = typename Problem::XDataType;
     using ComputeDataType           = typename Problem::ComputeDataType;
     static constexpr bool kFastFDiv = Problem::kFastFDiv;
+    static constexpr bool kWelford  = Problem::kWelford;
 
-    CK_TILE_DEVICE constexpr BlockWelford() {}
+    CK_TILE_DEVICE constexpr BlockNormReduce() {}
 
     // [CAUSION] - max_count_ is to deal with the padding problem
-    // max_count_ is depend on caller, eg: naive and splitN welford will have different
+    // max_count_ is depend on caller, eg: naive and splitN norm_reduce will have different
     // calculation of max_count_
     // -> use block_welford_calculate_max_count to compute
     template <typename XDistributedTensor_,
@@ -40,18 +41,24 @@ struct BlockWelford
             if(cur_count_ < max_count_)
             {
                 ++cur_count_;
-
                 sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
                     constexpr auto in_dstr_idx  = make_tuple(dstr_idx_i0, dstr_idx_i1);
                     constexpr auto out_dstr_idx = make_tuple(dstr_idx_i0);
 
                     auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
-
-                    welford_update(mean_tensor(out_dstr_idx),
-                                   var_tensor(out_dstr_idx),
-                                   x,
-                                   cur_count_,
-                                   constant<kFastFDiv>{});
+                    if(kWelford)
+                    {
+                        welford_update(mean_tensor(out_dstr_idx),
+                                       var_tensor(out_dstr_idx),
+                                       x,
+                                       cur_count_,
+                                       constant<kFastFDiv>{});
+                    }
+                    else
+                    {
+                        mean_tensor(out_dstr_idx) += x;
+                        var_tensor(out_dstr_idx) += x * x;
+                    }
                 });
             }
         });
@@ -91,10 +98,11 @@ struct BlockWelford
 };
 
 template <typename Problem_, typename Policy_ = void>
-struct BlockWelfordSync
+struct BlockNormReduceSync
 {
     using Problem                   = remove_cvref_t<Problem_>;
     static constexpr bool kFastFDiv = Problem::kFastFDiv;
+    static constexpr bool kWelford  = Problem::kWelford;
 
     template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
     CK_TILE_DEVICE void
@@ -152,36 +160,48 @@ struct BlockWelfordSync
                             (number<lid_over_rid_derivative << istage.value>{}.value);
 
                         // pull data from remote lane
-                        const auto v_remote_mean  = warp_shuffle(v_local_mean, src_lane);
-                        const auto v_remote_var   = warp_shuffle(v_local_var, src_lane);
-                        const auto v_remote_count = warp_shuffle(v_local_count, src_lane);
-
-                        // welford merge
-                        welford_merge(v_local_mean,
-                                      v_local_var,
-                                      v_local_count,
-                                      v_remote_mean,
-                                      v_remote_var,
-                                      v_remote_count,
-                                      constant<kFastFDiv>{});
+                        const auto v_remote_mean = warp_shuffle(v_local_mean, src_lane);
+                        const auto v_remote_var  = warp_shuffle(v_local_var, src_lane);
+                        if(kWelford)
+                        {
+                            const auto v_remote_count = warp_shuffle(v_local_count, src_lane);
+
+                            // norm_reduce merge
+                            welford_merge(v_local_mean,
+                                          v_local_var,
+                                          v_local_count,
+                                          v_remote_mean,
+                                          v_remote_var,
+                                          v_remote_count,
+                                          constant<kFastFDiv>{});
+                        }
+                        else
+                        {
+                            v_local_mean += v_remote_mean;
+                            v_local_var += v_remote_var;
+                        }
                     });
                 }
             });
 
             mean_tensor.get_thread_buffer()(i) = v_local_mean;
             var_tensor.get_thread_buffer()(i)  = v_local_var;
-
-            count = v_local_count;
+            if(kWelford)
+            {
+                count = v_local_count;
+            }
         });
     }
 };
 
 template <typename Problem_, typename Policy_ = void>
-struct BlockWelfordCrossWarpSync
+struct BlockNormReduceCrossWarpSync
 {
     using Problem                   = remove_cvref_t<Problem_>;
     using BlockShape                = typename Problem::BlockShape;
     static constexpr bool kFastFDiv = Problem::kFastFDiv;
+    static constexpr bool kWelford  = Problem::kWelford;
+    using smem_dtype                = std::conditional_t<kWelford, fp32x4_t, fp32x2_t>;
 
     template <typename MeanDistributedTensor_>
     CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
@@ -252,7 +272,7 @@ struct BlockWelfordCrossWarpSync
         static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
 
         // Note: we always pack everything into fp32x4
-        fp32x4_t* smem_ptr              = reinterpret_cast<fp32x4_t*>(smem);
+        smem_dtype* smem_ptr            = reinterpret_cast<smem_dtype*>(smem);
         const index_t lane_id           = get_lane_id();
         const index_t warp_id           = get_warp_id();
         constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
@@ -267,11 +287,13 @@ struct BlockWelfordCrossWarpSync
         if(lane_id == 0)
         {
             static_for<0, thread_buf_size, 1>{}([&](auto i) {
-                fp32x4_t local_scratch_;
+                smem_dtype local_scratch_;
                 local_scratch_[0] = bit_cast<float>(mean_tensor.get_thread_buffer()[i]);
                 local_scratch_[1] = bit_cast<float>(var_tensor.get_thread_buffer()[i]);
-                local_scratch_[2] = bit_cast<float>(count);
-
+                if(kWelford)
+                {
+                    local_scratch_[2] = bit_cast<float>(count);
+                }
                 smem_ptr[smem_offset + i * num_warps] = local_scratch_;
             });
         }
@@ -280,7 +302,7 @@ struct BlockWelfordCrossWarpSync
         // load from smem. here we let everythread to do compute :)
         index_t local_warp_id = warp_id / num_reduce_warps;
         index_t local_smem_os = local_warp_id * num_reduce_warps;
-        fp32x4_t all_scratch[thread_buf_size * num_reduce_warps];
+        smem_dtype all_scratch[thread_buf_size * num_reduce_warps];
         static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
             static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
                 all_scratch[i_0 * num_reduce_warps + i_1] =
@@ -293,32 +315,40 @@ struct BlockWelfordCrossWarpSync
 
         static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
             // TODO: use descriptor for this
-            auto v_local       = all_scratch[i_0 * num_reduce_warps];
-            auto v_local_mean  = bit_cast<DataType>(v_local[0]);
-            auto v_local_var   = bit_cast<DataType>(v_local[1]);
-            auto v_local_count = bit_cast<int>(v_local[2]);
+            auto v_local      = all_scratch[i_0 * num_reduce_warps];
+            auto v_local_mean = bit_cast<DataType>(v_local[0]);
+            auto v_local_var  = bit_cast<DataType>(v_local[1]);
+            int v_local_count = kWelford ? bit_cast<int>(v_local[2]) : 0;
 
             // further reduce mean/var
             static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
                 constexpr auto i_1        = number<i_1_n1 + 1>{};
-                const fp32x4_t v_remote   = all_scratch[i_0 * num_reduce_warps + i_1];
+                const smem_dtype v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
                 const auto v_remote_mean  = bit_cast<DataType>(v_remote[0]);
                 const auto v_remote_var   = bit_cast<DataType>(v_remote[1]);
-                const auto v_remote_count = bit_cast<int>(v_remote[2]);
-
-                welford_merge(v_local_mean,
-                              v_local_var,
-                              v_local_count,
-                              v_remote_mean,
-                              v_remote_var,
-                              v_remote_count,
-                              constant<kFastFDiv>{});
+                if(kWelford)
+                {
+                    const auto v_remote_count = bit_cast<int>(v_remote[2]);
+
+                    welford_merge(v_local_mean,
+                                  v_local_var,
+                                  v_local_count,
+                                  v_remote_mean,
+                                  v_remote_var,
+                                  v_remote_count,
+                                  constant<kFastFDiv>{});
+                }
+                else
+                {
+                    v_local_mean += v_remote_mean;
+                    v_local_var += v_remote_var;
+                }
             });
 
             mean_tensor.get_thread_buffer()(i_0) = v_local_mean;
             var_tensor.get_thread_buffer()(i_0)  = v_local_var;
-
-            count = v_local_count;
+            if(kWelford)
+                count = v_local_count;
         });
     }
 };
diff --git a/include/ck_tile/ops/welford/block/block_welford_problem.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp
similarity index 66%
rename from include/ck_tile/ops/welford/block/block_welford_problem.hpp
rename to include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp
index bcbfb7d76..53f5bfc6f 100644
--- a/include/ck_tile/ops/welford/block/block_welford_problem.hpp
+++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp
@@ -7,13 +7,18 @@
 
 namespace ck_tile {
 
-template <typename XDataType_, typename ComputeDataType_, typename BlockShape_, bool kFastFDiv_>
-struct BlockWelfordProblem
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename BlockShape_,
+          bool kFastFDiv_,
+          bool kWelford_>
+struct BlockNormReduceProblem
 {
     using XDataType                 = remove_cvref_t<XDataType_>;
     using ComputeDataType           = remove_cvref_t<ComputeDataType_>;
     using BlockShape                = remove_cvref_t<BlockShape_>;
     static constexpr bool kFastFDiv = kFastFDiv_;
+    static constexpr bool kWelford  = kWelford_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/norm_reduce/thread/thread_welford.hpp
similarity index 100%
rename from include/ck_tile/ops/welford/thread/thread_welford.hpp
rename to include/ck_tile/ops/norm_reduce/thread/thread_welford.hpp
-- 
GitLab


From 4f62f6e9b77a41ca34a68efd1297d4b68eda06d2 Mon Sep 17 00:00:00 2001
From: Mingtao Gu <145657261+mtgu0705@users.noreply.github.com>
Date: Fri, 3 Jan 2025 18:35:21 +0800
Subject: [PATCH 149/153] Implement the fp16xint4 scale weight only kernel for
 Ali (#1786)

* enable int4 scale (weight only) kernel

* format some files

* Add unit test for int4 weight only

* fixed and formatted code

* fixed

* formated

* formated

* fixed

* fixed a bug in the ckProfiler, and formatted the code

---------

Co-authored-by: mtgu0705 <mtgu@amd.com>
---
 example/01_gemm/CMakeLists.txt                |    1 +
 .../gemm_xdl_fp16_pk_i4_v3_b_scale.cpp        |  357 +++
 ..._gemm_pipeline_xdlops_b_scale_selector.hpp |  167 ++
 ...ckwise_gemm_pipeline_xdlops_v1_b_scale.hpp |  403 +++
 ...ckwise_gemm_pipeline_xdlops_v2_b_scale.hpp | 1248 ++++++++++
 ...ckwise_gemm_pipeline_xdlops_v3_b_scale.hpp |  530 ++++
 ...ckwise_gemm_pipeline_xdlops_v4_b_scale.hpp |  686 +++++
 .../gpu/device/device_gemm_v2.hpp             |   37 +
 .../device_gemm_xdl_cshuffle_v3_b_scale.hpp   |  781 ++++++
 .../element/unary_element_wise_operation.hpp  |   71 +-
 .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 2208 +++++++++++++++++
 .../threadwise_tensor_slice_transfer.hpp      |  200 ++
 include/ck/utility/amd_inline_asm.hpp         |    6 +-
 include/ck/utility/data_type.hpp              |    2 +
 .../gpu/gemm_b_scale.hpp                      |   91 +
 .../gpu/gemm_b_scale/CMakeLists.txt           |   10 +
 ...e_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp |  105 +
 ...4_f16_mk_nk_mn_mem_v2_default_instance.cpp |   32 +
 .../profiler/profile_gemm_b_scale_impl.hpp    |  448 ++++
 profiler/src/CMakeLists.txt                   |    2 +
 profiler/src/profile_gemm_b_scale.cpp         |  181 ++
 21 files changed, 7562 insertions(+), 4 deletions(-)
 create mode 100644 example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
 create mode 100644 profiler/include/profiler/profile_gemm_b_scale_impl.hpp
 create mode 100644 profiler/src/profile_gemm_b_scale.cpp

diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 354e443b3..d6df1514b 100755
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -30,6 +30,7 @@ add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3)
 add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp)
 add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp)
+add_example_executable(example_gemm_xdl_fp16_pk_i4_v3_b_scale gemm_xdl_fp16_pk_i4_v3_b_scale.cpp)
 add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
 add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
new file mode 100644
index 000000000..c8a40baa8
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+
+using ADataType        = ck::half_t;
+using BDataType        = ck::pk_i4_t;
+using BScaleDataType   = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr bool PermuteA = false;
+static constexpr bool PermuteB = true;
+
+static constexpr ck::index_t Scale_Block_N = 1;
+static constexpr ck::index_t Scale_Block_K = 128;
+
+static constexpr ck::index_t KPerBlock = 64;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, BScaleDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256, Scale_Block_N, Scale_Block_K,
+        128, 128,
+        KPerBlock, 8, 32,
+        32,   32,
+        4,    1,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<2, 128, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;
+
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        AccDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BScaleDataType> b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           (N + Scale_Block_N - 1) / Scale_Block_N,
+                                                           Scale_Stride_BN,
+                                                           BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 4:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 5:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    if constexpr(PermuteB)
+    {
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int j = 0; j < K0; j++)
+        {
+            for(int i = 0; i < N; i++)
+            {
+                for(int jj = 0; jj < K1; jj++)
+                {
+                    b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                }
+            }
+        }
+    }
+    else
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                b_k_n_permute(i * K + j) = b_k_n(i * K + j);
+            }
+        }
+    }
+
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int hi   = input[2];
+                int lo   = input[0];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 0, i) = i4x2;
+            }
+
+            {
+                int hi   = input[6];
+                int lo   = input[4];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 2, i) = i4x2;
+            }
+
+            {
+                int hi   = input[3];
+                int lo   = input[1];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 4, i) = i4x2;
+            }
+
+            {
+                int hi   = input[7];
+                int lo   = input[5];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 6, i) = i4x2;
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
+    b1_scale_device_buf.ToDevice(b1_k_n.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument =
+        gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          StrideA,
+                          StrideB,
+                          StrideC,
+                          Scale_Stride_BN,
+                          static_cast<BScaleDataType*>(b1_scale_device_buf.GetDeviceBuffer()),
+                          KBatch,
+                          a_element_op,
+                          b_element_op,
+                          c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<float> b_k_n_dequant({K, N});
+
+        float v_b = 0;
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
+                int8_t i4        = 0;
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+                i4  = i4 - 8;
+                v_b = ck::type_convert<float>(i4);
+
+                b_k_n_dequant(k, n) =
+                    ck::type_convert<float>(v_b) *
+                    ck::type_convert<float>(b1_k_n(k / Scale_Block_K, n / Scale_Block_N));
+            }
+        }
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n_dequant, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp
new file mode 100644
index 000000000..ea0c511da
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp"
+
+namespace ck {
+
+enum struct BlockGemmPipelineVersion
+{
+    v1, // Naive
+    v2, // Mem
+    v3, // Comp
+    v4, // Comp, double lds buffer
+    v5, // Comp, double global prefetch register buffer
+};
+
+template <BlockGemmPipelineVersion BlkGemmPipelineVer,
+          BlockGemmPipelineScheduler BlkGemmPipeSche,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+constexpr auto BlockGemmPipeline_Selector()
+{
+    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+    {
+        return BlockwiseGemmXdlops_pipeline_v1_b_scale<BlkGemmPipeSche,
+                                                       BlockSize,
+                                                       ADataType,
+                                                       BDataType,
+                                                       ComputeDataType,
+                                                       AccDataType,
+                                                       ATileDesc,
+                                                       BTileDesc,
+                                                       AMmaTileDesc,
+                                                       BMmaTileDesc,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerXDL,
+                                                       NPerXDL,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+    {
+        return BlockwiseGemmXdlops_pipeline_v2_b_scale<BlkGemmPipeSche,
+                                                       BlockSize,
+                                                       ADataType,
+                                                       BDataType,
+                                                       ComputeDataType,
+                                                       AccDataType,
+                                                       ATileDesc,
+                                                       BTileDesc,
+                                                       AMmaTileDesc,
+                                                       BMmaTileDesc,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerXDL,
+                                                       NPerXDL,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+    {
+        return BlockwiseGemmXdlops_pipeline_v3_b_scale<BlkGemmPipeSche,
+                                                       BlockSize,
+                                                       ADataType,
+                                                       BDataType,
+                                                       ComputeDataType,
+                                                       AccDataType,
+                                                       ATileDesc,
+                                                       BTileDesc,
+                                                       AMmaTileDesc,
+                                                       BMmaTileDesc,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerXDL,
+                                                       NPerXDL,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+    {
+        return BlockwiseGemmXdlops_pipeline_v4_b_scale<BlkGemmPipeSche,
+                                                       BlockSize,
+                                                       ADataType,
+                                                       BDataType,
+                                                       ComputeDataType,
+                                                       AccDataType,
+                                                       ATileDesc,
+                                                       BTileDesc,
+                                                       AMmaTileDesc,
+                                                       BMmaTileDesc,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerXDL,
+                                                       NPerXDL,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v5)
+    {
+        return BlockwiseGemmXdlops_pipeline_v5<BlkGemmPipeSche,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>{};
+    }
+    else
+    {
+        std::cerr << "BlockGemmPipeline configuration is not available" << std::endl;
+    }
+}
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
new file mode 100644
index 000000000..4246f4a44
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_b_scale.hpp
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Naive pipeline with lowest resource request per WGP
+// GlobalPrefetchStages: 1
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 0
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_v1_b_scale
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_v1_b_scale<BlockGemmPipelineScheduler::Intrawave,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::I0;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 1;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    __host__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              // BScale Thread Copy
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        const BGridBuffer& b_grid_buf,
+        BBlockBuffer& b_block_buf,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        CThreadBuffer& c_thread_buf,
+        // BScaleThreadCopy
+        const BScaleGridDesc& b_scale_grid_desc,
+        const BScaleThreadDesc& b_scale_thread_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+        // num_loop
+        index_t num_loop,
+        index_t num_loop_per_scale) const
+    {
+        // assume kperblock = scaleblockk
+        ignore            = num_loop_per_scale;
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_buf);
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                               b_scale_thread_copy_step.At(Number<1>{}));
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>();
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                // -------------------------------------------------------------------------------------------
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                block_sync_lds();
+                static_for<0, KRepeat, 1>{}([&](auto k) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k, I0),
+                                           a_thread_buf);
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, I0),
+                                           b_thread_buf);
+                    });
+                });
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        c_thread_buf_per_scale.Clear();
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(I0));
+                        });
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                            c_thread_buf(Number<c_offset>{}) +=
+                                c_thread_buf_per_scale[Number<t>{}] *
+                                type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                        });
+                    });
+                });
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(n0, I0),
+                                            b_scale_thread_buf);
+
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
+                });
+
+                b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                       b_scale_thread_copy_step.At(Number<1>{}));
+
+                block_sync_lds();
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                i += 1;
+
+            } while(i < (num_loop - 1));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            block_sync_lds();
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_buf);
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
+                });
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    c_thread_buf_per_scale.Clear();
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale.GetVectorTypeReference(I0));
+                    });
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                        c_thread_buf(Number<c_offset>{}) +=
+                            c_thread_buf_per_scale[Number<t>{}] *
+                            type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
new file mode 100644
index 000000000..776f66dbb
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
@@ -0,0 +1,1248 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Maximum Global Memory throughput pipeline with >=32KB data in fly
+// GlobalPrefetchStages: >=2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 0
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_v2_b_scale
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intrawave,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::I0;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t WgpPerCU =
+        (4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
+    static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
+        32768 / WgpPerCU,
+        (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
+    static constexpr index_t PrefetchStages =
+        FullMemBandPrefetchStages >= 2
+            ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
+            : 2;
+
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = PrefetchStages;
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        if(num_loop % PrefetchStages == 1)
+        {
+            return TailNumber::One;
+        }
+        else if(num_loop % PrefetchStages == 2)
+        {
+            return TailNumber::Two;
+        }
+        else if(num_loop % PrefetchStages == 3)
+        {
+            return TailNumber::Three;
+        }
+        else if(num_loop % PrefetchStages == 4)
+        {
+            return TailNumber::Four;
+        }
+        else if(num_loop % PrefetchStages == 5)
+        {
+            return TailNumber::Five;
+        }
+        else if(num_loop % PrefetchStages == 6)
+        {
+            return TailNumber::Six;
+        }
+        else if(num_loop % PrefetchStages == 7)
+        {
+            return TailNumber::Seven;
+        }
+        else
+        {
+            return TailNumber::Full;
+        }
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        const BBlockDesc& b_block_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
+
+        // Global prefetch [2, PrefetchStages]
+        static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) {
+            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
+            b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
+
+            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        });
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) {
+                    // -------------------------------------------------------------------------------------------
+                    block_sync_lds();
+                    static_for<0, KRepeat, 1>{}([&](auto k) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                               make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, k, I0),
+                                               a_thread_buf);
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                b_thread_copy_.Run(
+                                    b_block_desc_n0_n1_n2_k,
+                                    make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                    b_block_buf,
+                                    b_thread_desc_,
+                                    make_tuple(n0, I0, k, I0),
+                                    b_thread_buf);
+                            });
+                        });
+                    });
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, k0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(
+                        a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
+                    b_blockwise_copy.RunWrite(
+                        b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
+                    b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
+
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                });
+
+                i += PrefetchStages;
+            } while(i < (num_loop - PrefetchStages));
+        }
+
+        // tail
+
+        auto LoopTailFunc = [&](auto tail_num) {
+            static_for<1, tail_num, 1>{}([&](auto iprefetch) {
+                block_sync_lds();
+                static_for<0, KRepeat, 1>{}([&](auto k) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k, I0),
+                                           a_thread_buf);
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                               b_block_buf,
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k, I0),
+                                               b_thread_buf);
+                        });
+                    });
+                });
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            xdlops_gemm.Run(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+
+                block_sync_lds();
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
+            });
+
+            block_sync_lds();
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_buf);
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, I0),
+                                           b_thread_buf);
+                    });
+                });
+            });
+
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        };
+
+        if constexpr(TailNum == TailNumber::One)
+        {
+            block_sync_lds();
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_buf);
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, I0),
+                                           b_thread_buf);
+                    });
+                });
+            });
+
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+        else if constexpr(TailNum == TailNumber::Two)
+        {
+            LoopTailFunc(Number<2>{});
+        }
+        else if constexpr(TailNum == TailNumber::Three)
+        {
+            LoopTailFunc(Number<3>{});
+        }
+        else if constexpr(TailNum == TailNumber::Four)
+        {
+            LoopTailFunc(Number<4>{});
+        }
+        else if constexpr(TailNum == TailNumber::Five)
+        {
+            LoopTailFunc(Number<5>{});
+        }
+        else if constexpr(TailNum == TailNumber::Six)
+        {
+            LoopTailFunc(Number<6>{});
+        }
+        else if constexpr(TailNum == TailNumber::Seven)
+        {
+            LoopTailFunc(Number<7>{});
+        }
+        else if constexpr(TailNum == TailNumber::Full)
+        {
+            LoopTailFunc(Number<PrefetchStages>{});
+        }
+    }
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Interwave,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KPerThread;
+    using Base::xdlops_gemm;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
+    static constexpr index_t KPerInnerLoop  = math::max(KPerThread / NumMacClusters, KPack);
+    static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
+
+    static constexpr index_t WgpPerCU =
+        (4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
+    static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
+        32768 / WgpPerCU,
+        (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
+    static constexpr index_t PrefetchStages =
+        FullMemBandPrefetchStages >= 2
+            ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
+            : 2;
+
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = PrefetchStages;
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        if(num_loop % PrefetchStages == 1)
+        {
+            return TailNumber::One;
+        }
+        else if(num_loop % PrefetchStages == 2)
+        {
+            return TailNumber::Two;
+        }
+        else if(num_loop % PrefetchStages == 3)
+        {
+            return TailNumber::Three;
+        }
+        else if(num_loop % PrefetchStages == 4)
+        {
+            return TailNumber::Four;
+        }
+        else if(num_loop % PrefetchStages == 5)
+        {
+            return TailNumber::Five;
+        }
+        else if(num_loop % PrefetchStages == 6)
+        {
+            return TailNumber::Six;
+        }
+        else if(num_loop % PrefetchStages == 7)
+        {
+            return TailNumber::Seven;
+        }
+        else
+        {
+            return TailNumber::Full;
+        }
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        const BBlockDesc& b_block_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        const BScaleGridDesc& b_scale_grid_desc,
+                        // BScaleThreadCopy
+                        const BScaleThreadDesc& b_scale_thread_desc,
+                        BScaleThreadTransfer& b_scale_thread_copy,
+                        const BScaleGridBuffer& b_scale_grid_buf,
+                        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+                        // num loop
+                        index_t num_loop,
+                        index_t num_loop_per_scale) const
+    {
+        ignore = num_loop_per_scale;
+
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_buf);
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                               b_scale_thread_copy_step.At(Number<1>{}));
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
+
+        // Global prefetch [2, PrefetchStages]
+        static_for<1, PrefetchStages, 1>{}([&](auto iprefetch) {
+            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
+            b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
+
+            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        });
+
+        auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>(); // need?
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                static_for<0, PrefetchStages, 1>{}([&](auto iprefetch) {
+                    // -------------------------------------------------------------------------------------------
+                    block_sync_lds();
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                               make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, k0, I0),
+                                               a_thread_buf);
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                b_thread_copy_.Run(
+                                    b_block_desc_n0_n1_n2_k,
+                                    make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                    b_block_buf,
+                                    b_thread_desc_,
+                                    make_tuple(n0, I0, k0, I0),
+                                    b_thread_buf);
+                            });
+                        });
+                        __builtin_amdgcn_sched_barrier(0);
+                        // NOTE: Synchronize threads in a workgroup at the start of each MAC
+                        // cluster, but except the first, as we can shorten non-MAC cluster a bit
+                        // and there's no observable negative impact. The desired effect is waves in
+                        // a workgroup executing MAC in sync. This avoids some out-of-sync waves
+                        // hijacking MAC resource from other workgroups and reducing the chance of
+                        // latency hiding by waiting for the rest of the workgroup at the eventual
+                        // sync point.
+                        if constexpr(k0.value != 0 || KRepeat == 1)
+                        {
+                            __builtin_amdgcn_s_barrier();
+                            __builtin_amdgcn_sched_barrier(0);
+                        }
+                        static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+                            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                    vector_type<ComputeDataType, KPack> a_thread_vec;
+                                    vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                    static_for<0, KPack, 1>{}([&](auto ik) {
+                                        a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                                make_tuple(m0, I0, k0, k_ + ik))>{}];
+                                        b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                                make_tuple(n0, I0, k0, k_ + ik))>{}];
+                                    });
+
+                                    using mfma_input_type =
+                                        typename vector_type<ComputeDataType,
+                                                             xdlops_gemm.K1PerXdlops>::type;
+
+                                    constexpr index_t c_offset =
+                                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                    // The block_sync_lds() here performs double duty:
+                                    // A) safeguard against data hazard because barrier from
+                                    // blockwise_gemm is moved here B) reduce VMEM FIFO congestion
+                                    // by applying small delays to different wavefronts It is
+                                    // performed near the end of MAC cluster to minimize lgkmcnt
+                                    // penalty
+                                    if constexpr(k0.value == KRepeat - 1 &&
+                                                 k_.value == KPerInnerLoop - KPack &&
+                                                 m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
+                                    {
+                                        __builtin_amdgcn_sched_barrier(0);
+                                        block_sync_lds();
+                                        __builtin_amdgcn_sched_barrier(0);
+                                    }
+                                    xdlops_gemm.Run(
+                                        a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                    if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+                                    {
+                                        __builtin_amdgcn_sched_barrier(0);
+                                        __builtin_amdgcn_s_setprio(1);
+                                        __builtin_amdgcn_sched_barrier(0);
+                                    }
+                                });
+
+                                // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t)
+                                // {
+                                //     constexpr index_t c_offset =
+                                //         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                                //     c_thread_buf(Number<c_offset>{}) +=
+                                //         c_thread_buf_per_scale[Number<t>{}] *
+                                //         type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                                // });
+                            });
+                        });
+                        __builtin_amdgcn_sched_barrier(0);
+                        __builtin_amdgcn_s_setprio(0);
+                        __builtin_amdgcn_sched_barrier(0);
+                    });
+
+                    // static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    //         b_scale_thread_copy.Run(b_scale_grid_desc,
+                    //                                 b_scale_grid_buf,
+                    //                                 b_scale_thread_desc,
+                    //                                 make_tuple(n0, I0),
+                    //                                 b_scale_thread_buf);
+
+                    //         b_scale_thread_copy.MoveSrcSliceWindow(
+                    //         b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
+                    //     });
+                    // b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                    //                                    b_scale_thread_copy_step.At(Number<1>{}));
+
+                    // block_sync_lds();
+                    a_blockwise_copy.RunWrite(
+                        a_block_desc, a_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
+                    b_blockwise_copy.RunWrite(
+                        b_block_desc, b_block_buf, Number<(iprefetch + 1) % PrefetchStages>{});
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
+                    b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
+
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                });
+                i += PrefetchStages;
+            } while(i < (num_loop - PrefetchStages));
+        }
+
+        // tail
+
+        auto LoopTailFunc = [&](auto tail_num) {
+            static_for<1, tail_num, 1>{}([&](auto iprefetch) {
+                block_sync_lds();
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k0, I0),
+                                           a_thread_buf);
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                               b_block_buf,
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k0, I0),
+                                               b_thread_buf);
+                        });
+                    });
+
+                    __builtin_amdgcn_sched_barrier(0);
+                    if constexpr(k0.value != 0 || KRepeat == 1)
+                    {
+                        __builtin_amdgcn_s_barrier();
+                        __builtin_amdgcn_sched_barrier(0);
+                    }
+                    static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, k0, k_ + ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(n0, I0, k0, k_ + ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                if constexpr(k0.value == KRepeat - 1 &&
+                                             k_.value == KPerInnerLoop - KPack &&
+                                             m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
+                                {
+                                    __builtin_amdgcn_sched_barrier(0);
+                                    block_sync_lds();
+                                    __builtin_amdgcn_sched_barrier(0);
+                                }
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+                                {
+                                    __builtin_amdgcn_sched_barrier(0);
+                                    __builtin_amdgcn_s_setprio(1);
+                                    __builtin_amdgcn_sched_barrier(0);
+                                }
+                            });
+
+                            // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            //     constexpr index_t c_offset =
+                            //         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                            //     c_thread_buf(Number<c_offset>{}) +=
+                            //         c_thread_buf_per_scale[Number<t>{}] *
+                            //         type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                            // });
+                        });
+                    });
+                    __builtin_amdgcn_sched_barrier(0);
+                    __builtin_amdgcn_s_setprio(0);
+                    __builtin_amdgcn_sched_barrier(0);
+                });
+
+                // static_for<0, NRepeat, 1>{}([&](auto n0) {
+                //     b_scale_thread_copy.Run(b_scale_grid_desc,
+                //                             b_scale_grid_buf,
+                //                             b_scale_thread_desc,
+                //                             make_tuple(n0, I0),
+                //                             b_scale_thread_buf);
+
+                //     b_scale_thread_copy.MoveSrcSliceWindow(
+                //         b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
+                // });
+                // b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                //                                        b_scale_thread_copy_step.At(Number<1>{}));
+
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
+            });
+            block_sync_lds();
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k0, I0),
+                                       a_thread_buf);
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k0, I0),
+                                           b_thread_buf);
+                    });
+                });
+
+                __builtin_amdgcn_sched_barrier(0);
+                if constexpr(k0.value != 0 || KRepeat == 1)
+                {
+                    __builtin_amdgcn_s_barrier();
+                    __builtin_amdgcn_sched_barrier(0);
+                }
+                static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, k_ + ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, k_ + ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            if constexpr(k0.value == KRepeat - 1 &&
+                                         k_.value == KPerInnerLoop - KPack &&
+                                         m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                block_sync_lds();
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                            xdlops_gemm.Run(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                __builtin_amdgcn_s_setprio(1);
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                        });
+
+                        // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        //     constexpr index_t c_offset =
+                        //         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                        //     c_thread_buf(Number<c_offset>{}) +=
+                        //         c_thread_buf_per_scale[Number<t>{}] *
+                        //         type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                        // });
+                    });
+                });
+                __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_s_setprio(0);
+                __builtin_amdgcn_sched_barrier(0);
+            });
+        };
+
+        if constexpr(TailNum == TailNumber::One)
+        {
+            block_sync_lds();
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k0, I0),
+                                       a_thread_buf);
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k0 * KPerInnerLoop>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k0, I0),
+                                           b_thread_buf);
+                    });
+                });
+
+                __builtin_amdgcn_sched_barrier(0);
+                if constexpr(k0.value != 0 || KRepeat == 1)
+                {
+                    __builtin_amdgcn_s_barrier();
+                    __builtin_amdgcn_sched_barrier(0);
+                }
+                static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, k_ + ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, k_ + ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            if constexpr(k0.value == KRepeat - 1 &&
+                                         k_.value == KPerInnerLoop - KPack &&
+                                         m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                block_sync_lds();
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                            xdlops_gemm.Run(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+                            {
+                                __builtin_amdgcn_sched_barrier(0);
+                                __builtin_amdgcn_s_setprio(1);
+                                __builtin_amdgcn_sched_barrier(0);
+                            }
+                        });
+
+                        // static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        //     constexpr index_t c_offset =
+                        //         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                        //     c_thread_buf(Number<c_offset>{}) +=
+                        //         c_thread_buf_per_scale[Number<t>{}] *
+                        //         type_convert<AccDataType>(b_scale_thread_buf[n0]);
+                        // });
+                    });
+                });
+                __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_s_setprio(0);
+                __builtin_amdgcn_sched_barrier(0);
+            });
+        }
+        else if constexpr(TailNum == TailNumber::Two)
+        {
+            LoopTailFunc(Number<2>{});
+        }
+        else if constexpr(TailNum == TailNumber::Three)
+        {
+            LoopTailFunc(Number<3>{});
+        }
+        else if constexpr(TailNum == TailNumber::Four)
+        {
+            LoopTailFunc(Number<4>{});
+        }
+        else if constexpr(TailNum == TailNumber::Five)
+        {
+            LoopTailFunc(Number<5>{});
+        }
+        else if constexpr(TailNum == TailNumber::Six)
+        {
+            LoopTailFunc(Number<6>{});
+        }
+        else if constexpr(TailNum == TailNumber::Seven)
+        {
+            LoopTailFunc(Number<7>{});
+        }
+        else if constexpr(TailNum == TailNumber::Full)
+        {
+            LoopTailFunc(Number<PrefetchStages>{});
+        }
+    }
+
+    protected:
+    // K->M loopover
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor(
+        make_tuple(Number<MRepeat>{}, I1, Number<KRepeat>{}, Number<KPerInnerLoop>{}),
+        make_tuple(Number<KPerInnerLoop>{},
+                   Number<KRepeat * MRepeat * KPerInnerLoop>{},
+                   Number<MRepeat * KPerInnerLoop>{},
+                   I1));
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPerInnerLoop>{}),
+        make_tuple(Number<KPerInnerLoop>{},
+                   Number<KRepeat * NRepeat * KPerInnerLoop>{},
+                   Number<NRepeat * KPerInnerLoop>{},
+                   I1));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, KPerInnerLoop>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
+                                                         ComputeDataType,
+                                                         decltype(b_block_desc_n0_n1_n2_k),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<1, 1, 1, KPerInnerLoop>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{Base::CalculateBThreadOriginDataIndex()};
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
new file mode 100644
index 000000000..d1be88dd6
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_v3_b_scale
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intrawave,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+        constexpr auto num_ds_read_inst_b =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
+                ? HotLoopInstList::B_LDS_Read_Inst_Num
+                : HotLoopInstList::B_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num;
+
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_b_issue_cycle =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_mfma_rate =
+            (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
+        constexpr auto ds_read_b_mfma_rate =
+            (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
+
+        constexpr auto num_dsread_a_mfma =
+            (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+        constexpr auto num_dsread_b_mfma =
+            (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
+
+        // stage 1
+        // Separate this part?
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
+        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
+        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
+        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
+        constexpr auto num_mfma_per_issue =
+            num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
+        constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
+        constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
+
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA
+        });
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA
+        });
+
+        // stage 2
+        static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
+                         ds_read_a_mfma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_a - (num_dsread_a_mfma - 1) *
+                                                                              ds_read_a_mfma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+        });
+
+        static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
+                         ds_read_b_mfma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_b - (num_dsread_b_mfma - 1) *
+                                                                              ds_read_b_mfma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+        });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        const BBlockDesc& b_block_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        // BScaleThreadCopy
+                        const BScaleGridDesc& b_scale_grid_desc,
+                        const BScaleThreadDesc& b_scale_thread_desc,
+                        BScaleThreadTransfer& b_scale_thread_copy,
+                        const BScaleGridBuffer& b_scale_grid_buf,
+                        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+                        // num loop
+                        index_t num_loop,
+                        index_t num_loop_per_scale) const
+    {
+        __builtin_amdgcn_sched_barrier(0);
+
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        // B scale buffer
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_buf);
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if(num_loop_per_scale == 1)
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        constexpr auto num_scale_k_block = BScaleThreadDesc{}.GetLength(I1);
+        constexpr auto num_scale_krepeat = KRepeat / num_scale_k_block;
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Global prefetch 2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // Local prefetch 1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                   make_tuple(m0, I0, I0, Number<k0 * AMmaKStride>{}),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, k0, I0),
+                                   a_thread_buf);
+            });
+            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                b_thread_copy_.Run(
+                    b_block_desc_n0_n1_n2_k,
+                    make_tuple(n0, I0, I0, Number<k0 * BMmaKStride>{}),
+                    b_block_buf,
+                    b_scale_thread_buf[Number<n0 * num_scale_k_block + k0 / num_scale_krepeat>{}],
+                    b_thread_desc_,
+                    make_tuple(n0, I0, k0, I0),
+                    b_thread_buf);
+            });
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                block_sync_lds();
+
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(n0, I0),
+                                            b_scale_thread_buf);
+
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
+                });
+
+                if((i + 2) % num_loop_per_scale == 0)
+                {
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{}));
+                }
+                else
+                {
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{}));
+                }
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            xdlops_gemm.Run(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+
+                block_sync_lds();
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<k0 * AMmaKStride>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k0, I0),
+                                           a_thread_buf);
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k0 * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_scale_thread_buf[Number<n0 * num_scale_k_block +
+                                                                     k0 / num_scale_krepeat>{}],
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k0, I0),
+                                           b_thread_buf);
+                    });
+                });
+
+                HotLoopScheduler();
+                __builtin_amdgcn_sched_barrier(0);
+
+                i += 1;
+            } while(i < (num_loop - 1));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
new file mode 100644
index 000000000..f35c7a97c
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
@@ -0,0 +1,686 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimimal pipeline with highest resource request
+// GlobalPrefetchStages: 4
+// LocalPreFillStages: 2
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 2
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_v4_b_scale
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_v4_b_scale<BlockGemmPipelineScheduler::Intrawave,
+                                               BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 3;
+    static constexpr index_t PrefillStages   = 2;
+    static constexpr index_t GlobalBufferNum = 1;
+    static constexpr index_t HotloopUnroll   = 2;
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        if(num_loop % HotloopUnroll == 1)
+        {
+            return TailNumber::Odd;
+        }
+        else
+        {
+            return TailNumber::Even;
+        }
+    }
+
+    __device__ static constexpr void HotLoopScheduler()
+    {
+        // TODO: Take data type into consideration as pipe ver 3
+        // A-B splited schedule
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+        constexpr auto num_ds_read_inst_b =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
+                ? HotLoopInstList::B_LDS_Read_Inst_Num
+                : HotLoopInstList::B_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_issue_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_dswrite_per_issue_a =
+            (HotLoopInstList::A_LDS_Write_Inst_Num + num_issue_a - 1) / num_issue_a;
+        constexpr auto num_dsread_per_issue_a = num_ds_read_inst_a / num_issue_a;
+
+        constexpr auto num_issue_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+        constexpr auto num_dswrite_per_issue_b =
+            (HotLoopInstList::B_LDS_Write_Inst_Num + num_issue_b - 1) / num_issue_b;
+        constexpr auto num_dsread_per_issue_b = num_ds_read_inst_b / num_issue_b;
+
+        constexpr auto num_mfma_per_issue =
+            HotLoopInstList::C_MFMA_Inst_Num / (num_issue_a + num_issue_b);
+
+        static_for<0, num_issue_a, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dsread_per_issue_a, 1>{}([&](auto idsread) {
+                ignore = idsread;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+
+            static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008,
+                                                 num_mfma_per_issue - num_dsread_per_issue_a -
+                                                     num_dswrite_per_issue_a,
+                                                 0); // MFMA
+        });
+
+        static_for<0, num_issue_b, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dsread_per_issue_b, 1>{}([&](auto idsread) {
+                ignore = idsread;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+
+            static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008,
+                                                 num_mfma_per_issue - num_dsread_per_issue_a -
+                                                     num_dswrite_per_issue_b,
+                                                 0); // MFMA
+        });
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        const BBlockDesc& b_block_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        // BScaleThreadCopy
+                        const BScaleGridDesc& b_scale_grid_desc,
+                        const BScaleThreadDesc& b_scale_thread_desc,
+                        BScaleThreadTransfer& b_scale_thread_copy,
+                        const BScaleGridBuffer& b_scale_grid_buf,
+                        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+                        // num loop
+                        index_t num_loop,
+                        index_t num_loop_per_scale) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        // B scale buffer
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(a_thread_buf), Number<2>{}> a_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_scale_thread_buf), Number<2>{}> b_scale_thread_bufs;
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_bufs(I0));
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if(num_loop_per_scale == 1)
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0));
+
+        // Global prefetch 2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_bufs(I1));
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if(2 % num_loop_per_scale == 0)
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        // Local prefetch 1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                   make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                   a_block_buf.At(I0),
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, k, I0),
+                                   a_thread_bufs(I0));
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(I0),
+                                       b_scale_thread_bufs(I0)[n0],
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(I0));
+                });
+            });
+        });
+
+        // Local prefill 2
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1));
+
+        // Global prefetch 3
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_bufs(I0));
+
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if(3 % num_loop_per_scale == 0)
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   b_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            // This hot loop has two legacy loopover, to implement the double local buffer strategy
+            do
+            {
+                auto LoopFunc = [&](auto lds_read_buf,
+                                    auto lds_read_reg_buf,
+                                    auto lds_write_buf,
+                                    auto mfma_reg_buf) {
+                    block_sync_lds();
+
+                    static_for<0, KRepeat, 1>{}([&](auto k) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                               make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                               a_block_buf.At(lds_read_buf),
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, k, I0),
+                                               a_thread_bufs(lds_read_reg_buf));
+                        });
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                               b_block_buf.At(lds_read_buf),
+                                               b_scale_thread_bufs(lds_read_buf)[n0],
+                                               b_thread_desc_,
+                                               make_tuple(n0, I0, k, I0),
+                                               b_thread_bufs(lds_read_reg_buf));
+                        });
+                    });
+
+                    // B scale copy
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                                b_scale_grid_buf,
+                                                b_scale_thread_desc,
+                                                make_tuple(n0, I0),
+                                                b_scale_thread_bufs(lds_read_reg_buf));
+
+                        b_scale_thread_copy.MoveSrcSliceWindow(
+                            b_scale_grid_desc, b_scale_thread_copy_step.At(Number<0>{}));
+                    });
+
+                    if((i + 4 + mfma_reg_buf.value) % num_loop_per_scale == 0)
+                    {
+                        b_scale_thread_copy.MoveSrcSliceWindow(
+                            b_scale_grid_desc, b_scale_thread_copy_step.At(Number<2>{}));
+                    }
+                    else
+                    {
+                        b_scale_thread_copy.MoveSrcSliceWindow(
+                            b_scale_grid_desc, b_scale_thread_copy_step.At(Number<1>{}));
+                    }
+
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
+                    b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                    b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_bufs[mfma_reg_buf]
+                                                     [Number<a_thread_desc_.CalculateOffset(
+                                                         make_tuple(m0, I0, k0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    HotLoopScheduler();
+                };
+
+                LoopFunc(I1, I1, I0, I0);
+                LoopFunc(I0, I0, I1, I1);
+
+                i += HotloopUnroll;
+            } while(i < (num_loop - PrefetchStages));
+        }
+
+        auto ReadWriteCompFunc = [&](auto lds_read_buf,
+                                     auto lds_read_reg_buf,
+                                     auto lds_write_buf,
+                                     auto mfma_reg_buf) {
+            block_sync_lds();
+
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf.At(lds_read_buf),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_bufs(lds_read_reg_buf));
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_scale_thread_bufs(lds_read_buf)[n0],
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
+                });
+            });
+
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
+            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
+
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            HotLoopScheduler();
+        };
+
+        auto ReadCompFunc = [&](auto lds_read_buf, auto lds_read_reg_buf, auto mfma_reg_buf) {
+            block_sync_lds();
+
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf.At(lds_read_buf),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_bufs(lds_read_reg_buf));
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                       b_block_buf.At(lds_read_buf),
+                                       b_scale_thread_bufs(lds_read_buf)[n0],
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_bufs(lds_read_reg_buf));
+                });
+            });
+
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            HotLoopScheduler();
+        };
+
+        auto CompFunc = [&](auto mfma_reg_buf) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg_buf][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        };
+
+        // tail
+        if constexpr(TailNum == TailNumber::Odd)
+        {
+            ReadWriteCompFunc(I1, I1, I0, I0);
+            ReadCompFunc(I0, I0, I1);
+            CompFunc(I0);
+        }
+        else if constexpr(TailNum == TailNumber::Even)
+        {
+            ReadCompFunc(I1, I1, I0);
+            CompFunc(I1);
+        }
+    }
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
index 43909f77d..78d8aa997 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
@@ -77,6 +77,43 @@ struct DeviceGemmV2R1 : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename BScaleType,
+          typename CDataType,
+          index_t ScaleBlockN,
+          index_t ScaleBlockK,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmV2BScale : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t StrideScaleB,
+                        const void* p_b_scale,
+                        ck::index_t KSplit,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    virtual bool GetPermuteB()         = 0;
+    virtual ck::index_t GetKPerBlock() = 0;
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
new file mode 100644
index 000000000..044350d11
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t ScaleBlockN, // scale block for N
+          index_t ScaleBlockK, // scale block for K
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
+                                                             BLayout,
+                                                             CLayout,
+                                                             ADataType,
+                                                             BDataType,
+                                                             BScaleDataType,
+                                                             CDataType,
+                                                             ScaleBlockN,
+                                                             ScaleBlockK,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        ScaleBlockN,
+        ScaleBlockK,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave
+                    ? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
+                       MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2)
+                          ? 2
+                          : 1
+                    : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    index_t GetKPerBlock() override { return KPerBlock; }
+
+    bool GetPermuteB() override { return PermuteB; }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t StrideScaleB,
+                             const BScaleDataType* p_b_scale,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideC,
+                        StrideScaleB,
+                        p_b_scale,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      index_t StrideC,
+                                                      index_t StrideScaleB,
+                                                      const void* p_b_scale,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          StrideScaleB,
+                                          static_cast<const BScaleDataType*>(p_b_scale),
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmXdlUniversal"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 86a5af41b..c0b447174 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -44,6 +44,40 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
     return res.template AsType<half4_t>()[Number<0>{}];
 }
 
+__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    // Extract the two int4 at low bit and create two fp16 number.
+    int lo = amd_assembly_and_or_b32(q, LO, EX);
+    // Extract the two int4 at hight bit and create two fp16 number.
+    int hi = amd_assembly_and_or_b32(q, HI, EX);
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    vector_type<half_t, 4> res;
+
+    res.template AsType<half2_t>()(Number<0>{}) =
+        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+
+    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
+        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<0>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<1>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
+
+    return res.template AsType<half4_t>()[Number<0>{}];
+}
+
 __host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q)
 {
 #if 1
@@ -171,7 +205,42 @@ struct PassThroughPack8
         dst.template AsType<bhalf2_t>()(Number<3>{}) =
             pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<3>{}]);
 
-        y          = dst.template AsType<bhalf8_t>()[Number<0>{}];
+        y = dst.template AsType<bhalf8_t>()[Number<0>{}];
+#endif
+    }
+    constexpr const static bool is_pack8_invocable = true;
+};
+
+struct DequantPack8
+{
+    template <typename Y, typename X, typename Z>
+    __host__ __device__ void operator()(Y& y, const X& x, const Z& z) const;
+
+    __host__ __device__ constexpr void
+    operator()(ck::half8_t& y, const ck::pk_i4x4_t& x, const ck::half2_t& z) const
+    {
+#if 1
+        vector_type<half_t, 8> result;
+
+        result.template AsType<half4_t>()(Number<0>{}) = pki4_to_half4_scale(bit_cast<int>(x), z);
+        result.template AsType<half4_t>()(Number<1>{}) =
+            pki4_to_half4_scale(bit_cast<int>(x) >> 8, z);
+
+        y = result.template AsType<half8_t>()[Number<0>{}];
+#else
+        vector_type<half_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};
+
+        dst.template AsType<half2_t>()(Number<0>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<half2_t>()(Number<1>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<half2_t>()(Number<2>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<half2_t>()(Number<3>{}) =
+            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+
+        y          = dst.template AsType<half8_t>()[Number<0>{}];
 #endif
     }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
new file mode 100644
index 000000000..bdb24c25a
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -0,0 +1,2208 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/utility/common_header.hpp"
+
+namespace ck {
+
+// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
+// kernel function Blockers:
+// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on
+// two lds chunks.
+// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
+// buffer when we declare __shared__ inside blkgemmpipe
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
+        p_shared,
+        karg);
+
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
+        p_shared_0,
+        p_shared_1,
+        karg);
+
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t ScaleBlockN, // scale N
+          index_t ScaleBlockK, // scale K
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct GridwiseGemm_xdl_cshuffle_v3
+{
+    using BScaleType = ck::half_t;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    // K1 should be Number<...>
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
+
+    static constexpr index_t KPack =
+        math::max(math::lcm(AK1Number, BK1Number),
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_least_multiple(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_least_multiple(N, NPerBlock);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
+    }
+
+    __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * KPerBlock;
+    }
+
+    __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1)
+    {
+        constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+        auto K_t                = K_Batch * KReadVec;
+        return (K + K_t - 1) / K_t * KReadVec;
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock);
+    }
+
+    template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
+    __host__ __device__ static constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1&)
+    {
+        constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{});
+        constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{});
+
+        return transform_tensor_descriptor(
+            TileDesc_K0_MN_K1{},
+            make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number<K0>{}, Number<K1>{})),
+                       make_unmerge_transform(make_tuple(
+                           Number<MNXdlPerWave>{}, Number<MNWaves>{}, Number<MNPerXdl>{}))),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+            make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            if constexpr(!PermuteB)
+            {
+                // not pad N or K
+                const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                    b_grid_desc_nraw_kraw,
+                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                               make_pass_through_transform(N)),
+                    make_tuple(Sequence<1>{}, Sequence<0>{}),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+                return b_grid_desc_bk0_n_bk1;
+            }
+            else
+            {
+                // Weight Tile Permute
+                constexpr index_t BK01 = KPerBlock / BK1Value;
+                // const index_t BK00     = BK0 / BK01;
+                const index_t BK0_ = StrideB / BK1Value;
+                const index_t BK00 = BK0_ / BK01;
+
+                const auto b_grid_desc_bk00_n_bk01_bk1_permute =
+                    make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value));
+
+                const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor(
+                    b_grid_desc_bk00_n_bk01_bk1_permute,
+                    make_tuple(make_merge_transform(make_tuple(BK00, BK01)),
+                               make_pass_through_transform(make_tuple(N)),
+                               make_pass_through_transform(BK1Value)),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+                return b_grid_desc_bk0_n_bk1_permute;
+            }
+        }
+    }
+
+    template <typename ABlockDesc_AK0_M_AK1>
+    __host__ __device__ static constexpr auto
+    MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&)
+    {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        return MakeGemmMmaTileDescriptor<MXdlPerWave, MWaves, MPerXdl>(ABlockDesc_AK0_M_AK1{});
+    }
+
+    template <typename BBlockDesc_BK0_N_BK1>
+    __host__ __device__ static constexpr auto
+    MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
+    {
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        return MakeGemmMmaTileDescriptor<NXdlPerWave, NWaves, NPerXdl>(BBlockDesc_BK0_N_BK1{});
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+#endif
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_,
+                         index_t StrideScaleB_,
+                         index_t KBatch_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              StrideScaleB{StrideScaleB_},
+              KBatch{KBatch_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KRead{CalculateKRead(K_, KBatch_)},
+              KPadded{CalculateKPadded(K_, KBatch_)},
+              AK0{CalculateAK0Padded(K_, KBatch_)},
+              BK0{CalculateBK0Padded(K_, KBatch_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "SScaleB:" << StrideScaleB << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t StrideScaleB;
+        index_t KBatch;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KRead;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const ADataType* p_a_grid_,
+                          const BDataType* p_b_grid_,
+                          CDataType* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_,
+                          index_t StrideScaleB_,
+                          const BScaleType* p_b_scale_grid_,
+                          index_t k_batch_,
+                          AElementwiseOperation a_element_op_,
+                          BElementwiseOperation b_element_op_,
+                          CElementwiseOperation c_element_op_,
+                          bool is_reduce_ = false)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, StrideScaleB_, k_batch_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              p_b_scale_grid{p_b_scale_grid_},
+              a_element_op{a_element_op_},
+              b_element_op{b_element_op_},
+              c_element_op{c_element_op_},
+              is_reduce(is_reduce_)
+        {
+        }
+
+        __host__ __device__ inline bool IsReduceAdd() const
+        {
+            return (Problem::KBatch > 1) && is_reduce;
+        }
+
+        __host__ __device__ inline bool IsAtomicAdd() const
+        {
+            return (Problem::KBatch > 1) && (!is_reduce);
+        }
+
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        CDataType* p_c_grid;
+
+        const BScaleType* p_b_scale_grid;
+        const AElementwiseOperation a_element_op;
+        const BElementwiseOperation b_element_op;
+        const CElementwiseOperation c_element_op;
+        bool is_reduce;
+    };
+
+    struct SplitKBatchOffset
+    {
+
+        __device__ SplitKBatchOffset(Argument& karg)
+        {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+            }
+
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                if constexpr(!PermuteB)
+                {
+                    b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+                }
+                else
+                {
+                    const int k0_offset = karg.KRead * karg.N;
+                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                }
+            }
+
+            // Calculate B scale offset
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK) * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                scale_k_split_offset = blockIdx.z * (karg.KRead / ScaleBlockK);
+            }
+
+            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            {
+                karg.K = karg.KRead;
+            }
+            else
+            {
+                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
+            }
+
+            if(karg.IsReduceAdd())
+            {
+                c_reduce_offset = blockIdx.z * karg.M * karg.N;
+            }
+            else
+            {
+                c_reduce_offset = 0;
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t scale_k_split_offset; // New member for scale matrix offset
+        index_t c_reduce_offset;
+    };
+
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+        }
+        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
+        // in some cases.
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize;
+            constexpr auto MLdsLayer        = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock * MLdsLayer>{}, I1));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<MPerBlock / MLdsLayer>{}, Number<AK0Number * MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number<MLdsLayer>{})),
+                           make_pass_through_transform(Number<MPerBlock / MLdsLayer>{}),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_ak0_mldslayer_m_ak1,
+                make_tuple(make_pass_through_transform(AK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<MPerBlock / MLdsLayer>{}, Number<MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+        else // ColumnMajor A
+        {
+            // kfold and mpair dimension is not always required.
+            // more dimension in merge_transform increase the difficulty of generating immarg offset
+            // for compiler.
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
+
+            constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
+
+            constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : 128 / (AK1Number * M0 * sizeof(ADataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=mpair<=n0
+            constexpr auto mpair = (AK1Number * MPerXdl * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : ((128 / (AK1Number * MPerXdl * sizeof(ADataType))) > M0
+                                              ? M0
+                                              : 128 / (AK1Number * MPerXdl * sizeof(ADataType)));
+
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * M1>{},
+                           Number<kfold * M0 / mpair>{},
+                           Number<mpair>{},
+                           AK1Number));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * M1>{}, Number<kfold * M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<M1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<M0 / mpair>{}, Number<mpair>{}, Number<M1>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // B matrix in LDS memory, dst of blockwise copy
+        if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+                make_tuple(BK1Number, Number<KPerBlock + BBlockLdsExtraN>{}, I1));
+        }
+        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+        {
+            // NLdsLayer * K0 as logical Bank
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize;
+            constexpr index_t NLdsLayer     = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
+                make_tuple(BK1Number, Number<KPerBlock * NLdsLayer>{}, I1));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<NPerBlock / NLdsLayer>{}, Number<BK0Number * NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number<NLdsLayer>{})),
+                           make_pass_through_transform(Number<NPerBlock / NLdsLayer>{}),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_bk0_nldslayer_n_bk1,
+                make_tuple(make_pass_through_transform(BK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<NPerBlock / NLdsLayer>{}, Number<NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+        else // RowMajor B
+        {
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
+
+            constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
+
+            constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : 128 / (BK1Number * N0 * sizeof(BDataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=npair<=n0
+            constexpr auto npair = (BK1Number * NPerXdl * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : ((128 / (BK1Number * NPerXdl * sizeof(BDataType))) > N0
+                                              ? N0
+                                              : 128 / (BK1Number * NPerXdl * sizeof(BDataType)));
+
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * N1>{},
+                           Number<kfold * N0 / npair>{},
+                           Number<npair>{},
+                           BK1Number));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * N1>{}, Number<kfold * N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<N1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<N0 / npair>{}, Number<npair>{}, Number<N1>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+    }
+
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    using BlockwiseGemmPipe =
+        remove_cvref_t<decltype(BlockGemmPipeline_Selector<
+                                BlkGemmPipelineVer,
+                                BlkGemmPipeSched,
+                                BlockSize,
+                                ADataType,
+                                BDataType,
+                                ComputeTypeA,
+                                AccDataType,
+                                decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
+                                decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()),
+                                decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(
+                                    GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
+                                decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(
+                                    GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
+                                ABlockTransferSrcScalarPerVector,
+                                BBlockTransferSrcScalarPerVector,
+                                MPerBlock,
+                                NPerBlock,
+                                KPerBlock,
+                                MPerXdl,
+                                NPerXdl,
+                                MXdlPerWave,
+                                NXdlPerWave,
+                                KPack>())>;
+
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize +
+                          b_block_space_size_aligned * sizeof(BDataType) / BPackedSize),
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    __host__ static constexpr bool CheckValidity(const Argument& karg)
+    {
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
+        {
+            if(!(karg.M % MPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
+        {
+            if(!(karg.N % NPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+
+            auto K_t = karg.KBatch * KPerBlock;
+            if(!(karg.K % K_t == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                              << karg.K << " " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+            auto K_t                = karg.KBatch * KReadVec;
+            auto KReadPadSplited    = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
+            if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, float>::value ||
+                       is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, int32_t>::value))
+        {
+            if(!karg.IsReduceAdd())
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__
+                              << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                if(karg.KBatch > 1)
+                {
+                    return false;
+                }
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
+
+        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        {
+            if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
+            {
+                return false;
+            }
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    template <typename CGridDesc>
+    __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    // if arch = gfx942
+    using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
+    // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename BGridDesc_BK0_N_K1,
+              typename BScaleGridDesc_BN_AK,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               const BScaleType* p_b_scale_grid,
+                               void* p_shared,
+                               const Problem& problem,
+                               const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
+                               const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak,
+                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        // B Scale buffer
+        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BDataType,
+                                                BDataType,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        // Cast after lds
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
+                                                                            sizeof(ADataType) /
+                                                                            APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        // b scale
+        // static_assert(KPerBlock <= ScaleBlockK);
+        static constexpr auto mfma        = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
+        static constexpr auto KPerXdlops  = mfma.GetKPerXdlops();
+        static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
+        static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
+        static constexpr auto KPerThread  = KPerBlock / K0PerXdlops;
+
+        static constexpr auto ScaleSliceSizeN       = NXdlPerWave;
+        static constexpr auto ScaleSliceSizeK       = (KPerThread + ScaleBlockK - 1) / ScaleBlockK;
+        static constexpr auto KBlockScaleSliceSizeK = (KPerBlock + ScaleBlockK - 1) / ScaleBlockK;
+
+        constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
+
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        auto b_thread_offset_n =
+            get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * KPerThread;
+
+        auto b_scale_thread_copy =
+            ThreadwiseTensorSliceTransfer_v2<BScaleType,
+                                             BScaleType,
+                                             decltype(b_scale_grid_desc_bn_ak),
+                                             decltype(b_scale_thread_desc),
+                                             Sequence<1, ScaleSliceSizeK>,
+                                             Sequence<0, 1>,
+                                             1,
+                                             ScaleSliceSizeK,
+                                             1,
+                                             false>(
+                b_scale_grid_desc_bn_ak,
+                make_multi_index(block_n_id * NPerBlock / ScaleBlockN + b_thread_offset_n,
+                                 b_thread_offset_k / ScaleBlockK));
+
+        constexpr auto b_scale_thread_slice_copy_step =
+            make_tuple(make_multi_index(NWaves * NPerXdl, 0),
+                       make_multi_index(-NPerBlock, 0),
+                       make_multi_index(-NPerBlock, KBlockScaleSliceSizeK));
+
+        const index_t num_k_block_per_scale = (ScaleBlockK + KPerBlock - 1) / KPerBlock;
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+            a_grid_desc_ak0_m_ak1,
+            a_block_desc_ak0_m_ak1,
+            a_blockwise_copy,
+            a_grid_buf,
+            a_block_buf,
+            a_block_slice_copy_step,
+            b_grid_desc_bk0_n_bk1,
+            b_block_desc_bk0_n_bk1,
+            b_blockwise_copy,
+            b_grid_buf,
+            b_block_buf,
+            b_block_slice_copy_step,
+            c_thread_buf,
+            b_scale_grid_desc_bn_ak,
+            b_scale_thread_desc,
+            b_scale_thread_copy,
+            b_scale_grid_buf,
+            b_scale_thread_slice_copy_step,
+            num_k_block_main_loop,
+            num_k_block_per_scale);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               const BScaleType* p_b_scale_grid,
+                               void* p_shared,
+                               const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        // B Scale grid
+        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
+            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
+                       math::integer_divide_ceil(problem.K, ScaleBlockK)),
+            make_tuple(problem.StrideScaleB, 1));
+
+        Run<decltype(a_grid_desc_ak0_m_ak1),
+            decltype(b_grid_desc_bk0_n_bk1),
+            decltype(b_scale_grid_desc_bn_ak),
+            decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+            HasMainKBlockLoop,
+            CGlobalMemoryDataOperation,
+            TailNum>(p_a_grid,
+                     p_b_grid,
+                     p_c_grid,
+                     p_b_scale_grid,
+                     p_shared,
+                     problem,
+                     a_grid_desc_ak0_m_ak1,
+                     b_grid_desc_bk0_n_bk1,
+                     b_scale_grid_desc_bn_ak,
+                     c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename BGridDesc_BK0_N_K1,
+              typename BScaleGridDesc_BN_AK,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    CDataType* p_c_grid,
+                                    const BScaleType* p_b_scale_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem,
+                                    const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                                    const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
+                                    const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak,
+                                    const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        // B Scale buffer
+        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BDataType,
+                                                BDataType,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            bit_cast<BDataType*>(static_cast<char*>(p_shared_0) +
+                                 a_block_space_size_aligned * sizeof(ADataType) / APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            bit_cast<BDataType*>(bit_cast<char*>(p_shared_1) +
+                                 a_block_space_size_aligned * sizeof(ADataType) / APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
+        auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        // B scale
+        static constexpr auto mfma        = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
+        static constexpr auto KPerXdlops  = mfma.GetKPerXdlops();
+        static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
+        static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
+        static constexpr auto KPerThread  = KPerBlock / K0PerXdlops;
+
+        const index_t ScaleSliceSizeN               = NXdlPerWave;
+        static constexpr auto ScaleSliceSizeK       = (KPerThread + ScaleBlockK - 1) / ScaleBlockK;
+        static constexpr auto KBlockScaleSliceSizeK = (KPerBlock + ScaleBlockK - 1) / ScaleBlockK;
+
+        constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
+
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        auto b_thread_offset_n =
+            get_thread_local_1d_id() % NPerXdl + (get_thread_local_1d_id() / 64) % NWaves * NPerXdl;
+        auto b_thread_offset_k = (get_thread_local_1d_id() % 64) / NPerXdl * KPerThread;
+
+        auto b_scale_thread_copy =
+            ThreadwiseTensorSliceTransfer_v2<BScaleType,
+                                             BScaleType,
+                                             decltype(b_scale_grid_desc_bn_ak),
+                                             decltype(b_scale_thread_desc),
+                                             Sequence<1, ScaleSliceSizeK>,
+                                             Sequence<0, 1>,
+                                             1,
+                                             ScaleSliceSizeK,
+                                             1,
+                                             false>(
+                b_scale_grid_desc_bn_ak,
+                make_multi_index(block_n_id * NPerBlock / ScaleBlockN + b_thread_offset_n,
+                                 b_thread_offset_k / ScaleBlockK));
+
+        constexpr auto b_scale_thread_slice_copy_step =
+            make_tuple(make_multi_index(NWaves * NPerXdl, 0),
+                       make_multi_index(-NPerBlock, 0),
+                       make_multi_index(-NPerBlock, KBlockScaleSliceSizeK));
+
+        const index_t num_k_block_per_scale = (ScaleBlockK + KPerBlock - 1) / KPerBlock;
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+            a_grid_desc_ak0_m_ak1,
+            a_block_desc_ak0_m_ak1,
+            a_blockwise_copy,
+            a_grid_buf,
+            a_block_bufs,
+            a_block_slice_copy_step,
+            b_grid_desc_bk0_n_bk1,
+            b_block_desc_bk0_n_bk1,
+            b_blockwise_copy,
+            b_grid_buf,
+            b_block_bufs,
+            b_block_slice_copy_step,
+            c_thread_buf,
+
+            b_scale_grid_desc_bn_ak,
+            b_scale_thread_desc,
+            b_scale_thread_copy,
+            b_scale_grid_buf,
+            b_scale_thread_slice_copy_step,
+
+            num_k_block_main_loop,
+            num_k_block_per_scale);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared_0),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    CDataType* p_c_grid,
+                                    const BScaleType* p_b_scale_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
+            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
+                       math::integer_divide_ceil(problem.K, ScaleBlockK)),
+            make_tuple(problem.StrideScaleB, 1));
+
+        Run_2Lds<decltype(a_grid_desc_ak0_m_ak1),
+                 decltype(b_grid_desc_bk0_n_bk1),
+                 decltype(b_scale_grid_desc_bn_ak),
+                 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                 HasMainKBlockLoop,
+                 CGlobalMemoryDataOperation,
+                 TailNum>(p_a_grid,
+                          p_b_grid,
+                          p_c_grid,
+                          p_b_scale_grid,
+                          p_shared_0,
+                          p_shared_1,
+                          problem,
+                          a_grid_desc_ak0_m_ak1,
+                          b_grid_desc_bk0_n_bk1,
+                          b_scale_grid_desc_bn_ak,
+                          c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 758900200..8c65ef32a 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1222,6 +1222,206 @@ struct ThreadwiseTensorSliceTransfer_v4
         });
     }
 
+    // Fuse scale
+    template <typename SrcRefToOriginDisplacement,
+              typename DstOriginIdx,
+              typename SrcBuffer,
+              typename DstBuffer>
+    __device__ void Run(const SrcDesc&,
+                        const SrcRefToOriginDisplacement&,
+                        const SrcBuffer& src_buf,
+                        const DstData& scale,
+                        const DstDesc&,
+                        const DstOriginIdx&,
+                        DstBuffer& dst_buf) const
+    {
+        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
+                      "wrong! SrcDesc and DstDesc need to known at compile-time");
+
+        static_assert(
+            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
+                is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
+            "wrong! SrcBuffer or DstBuffer data type is wrong");
+
+        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
+
+        static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
+                          is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
+                      "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
+                      "at compile-time");
+
+        // SrcDesc and DstDesc are known at compile-time
+        constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
+        constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
+
+        // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time
+        constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
+        constexpr auto dst_origin_idx             = to_multi_index(DstOriginIdx{});
+
+        // scalar per access of each dim
+        constexpr auto src_scalar_per_access = generate_sequence_v2(
+            [&](auto i) constexpr {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Number<SrcScalarPerVector>{};
+                }
+                else
+                {
+                    return Number<1>{};
+                }
+            },
+            Number<nDim>{});
+
+        // scalar step (if steping on SrcVectorDim) of each dim
+        constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
+            [&](auto i) constexpr {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Number<1>{};
+                }
+                else
+                {
+                    return Number<0>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto dim_access_order = DimAccessOrder{};
+
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, dim_access_order);
+
+        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+#if 0
+            // TODO: unable to compile
+            // position in slice window
+            constexpr auto data_to_origin_disp_idx =
+                container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
+                src_scalar_per_access;
+#else
+            // position in slice window
+            constexpr auto data_to_origin_disp_idx =
+                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
+#endif
+            // src coordinate
+            constexpr auto src_ref_to_data_disp_idx =
+                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
+
+            constexpr auto src_ref_to_data_disp_coord_step =
+                make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);
+
+            auto src_data_coord = src_ref_coord_;
+
+            move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
+
+            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;
+
+            using src_vector_t = typename decltype(src_tmp_vector)::type;
+
+            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                src_desc, src_data_coord);
+
+            // copy data from src_buf into src_tmp_vector
+            if constexpr(SrcBuffer::IsDynamicBuffer())
+            {
+                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
+                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
+                                                       is_src_valid);
+            }
+            else if constexpr(SrcBuffer::IsStaticBuffer())
+            {
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t src_offset = src_desc.CalculateOffset(
+                        src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
+                        i * src_scalar_step_in_vector);
+
+                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
+                });
+            }
+
+            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
+            {
+                // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
+                // DstData)
+                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
+                vector_type<DstData, 2> scale_vector;
+                scale_vector.template AsType<DstData>()(Number<0>{}) = scale;
+                scale_vector.template AsType<DstData>()(Number<1>{}) = scale;
+
+                constexpr index_t pack_size = 8;
+
+                static_assert(SrcScalarPerVector % pack_size == 0, "");
+
+                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
+                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
+                using scale_v_t = typename vector_type_maker_t<DstData, 2>::type;
+
+                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
+                    ck::tensor_operation::element_wise::DequantPack8{}(
+                        dst_tmp_vector.template AsType<dst_v_t>()(i),
+                        src_tmp_vector.template AsType<src_v_t>()[i],
+                        scale_vector.template AsType<scale_v_t>()[Number<0>{}]);
+                });
+
+                // copy data from dst_tmp_vector into dst_buf
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
+
+                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+                });
+            }
+            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
+                              is_same<remove_cvref_t<DstData>, half_t>::value &&
+                              SrcScalarPerVector % 2 == 0)
+            {
+                // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
+                // DstData)
+                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
+
+                constexpr index_t pack_size = 2;
+
+                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
+                using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
+                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
+                    ck::tensor_operation::element_wise::PassThroughPack2{}(
+                        dst_tmp_vector.template AsType<dst_v_t>()(i),
+                        src_tmp_vector.template AsType<src_v_t>()[i]);
+                });
+
+                // copy data from dst_tmp_vector into dst_buf
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
+
+                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+                });
+            }
+            else
+            {
+                // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
+                // DstData)
+                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
+
+                // TODO: if SrcData and DstData are vetor type, then static_cast may not compile
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    dst_tmp_vector.template AsType<DstData>()(i) =
+                        type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
+                });
+
+                // copy data from dst_tmp_vector into dst_buf
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
+
+                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+                });
+            }
+        });
+    }
+
     template <typename SrcSliceMoveStepIdx>
     __device__ void MoveSrcSliceWindow(const SrcDesc&,
                                        const SrcSliceMoveStepIdx& src_slice_move_step_idx)
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index 6761c08f2..113f3af4a 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -4,8 +4,8 @@
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
 
-#include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"
+#include "data_type.hpp"
 
 // TODO: deprecate all amd_assembly_outer_product_xxx
 
@@ -21,14 +21,14 @@ inline __device__ int amd_assembly_and_or_b32(int a, int b, int d)
 inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c)
 {
     half2_t d;
-    asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+    asm volatile("v_pk_fma_f16 %0, %1, %2, %3" : "=v"(d) : "v"(a), "v"(b), "v"(c));
     return d;
 }
 
 inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b)
 {
     half2_t c;
-    asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
+    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }
 
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index 86bc3c394..94608f5dc 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -19,6 +19,8 @@ struct pk_i4_t
     type data;
     __host__ __device__ constexpr pk_i4_t() : data{type{}} {}
     __host__ __device__ constexpr pk_i4_t(type init) : data{init} {}
+
+    __host__ __device__ constexpr operator float() const { return static_cast<int8_t>(data); }
 };
 
 inline constexpr auto next_pow2(uint32_t x)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
new file mode 100644
index 000000000..93eed31bc
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include <memory>
+#include <vector>
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
+void add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2BScale<Row,
+                                                   Col,
+                                                   Row,
+                                                   F16,
+                                                   I4,
+                                                   F16,
+                                                   F16,
+                                                   1,
+                                                   128,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   PassThrough>>>& instances);
+#endif
+
+template <typename ADataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          index_t ScaleBlockK>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmV2BScale<
+    ALayout,
+    BLayout,
+    CLayout,
+    ADataType,
+    BDataType,
+    BScaleDataType,
+    CDataType,
+    1,
+    ScaleBlockK,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceGemmV2BScale<ALayout,
+                                        BLayout,
+                                        CLayout,
+                                        ADataType,
+                                        BDataType,
+                                        BScaleDataType,
+                                        CDataType,
+                                        1,
+                                        ScaleBlockK,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, pk_i4_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs);
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt
new file mode 100644
index 000000000..424320fa8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/CMakeLists.txt
@@ -0,0 +1,10 @@
+# ONLY XDL_KERNELS
+set(GEMM_B_SCALE_INSTANCES)
+
+list(APPEND GEMM_B_SCALE_INSTANCES 
+        device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
+        )
+
+set_source_files_properties(device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+add_instance_library(device_gemm_b_scale_instance ${GEMM_B_SCALE_INSTANCES})
\ No newline at end of file
diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
new file mode 100644
index 000000000..52735e9df
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using I4  = pk_i4_t;
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+#if 0
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_b_scale_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple<
+
+#endif
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| BScale| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|   Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |   Type|      |        |         |   Operation|   Operation|   Operation|              |      |     N|     K|      |      |      |    |    |Wave| Wave|     |     | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |       |      |        |         |            |            |            |              |      |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+        
+        //Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,   128,   8,   32,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,   128,   8,   32,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+ 
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,  128,    64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,             2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        
+        //Latency friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        
+        // Memory friendly v3
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,   128,   32,   128,   8,   32,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,   128,   16,   128,   8,   16,  16,   16,    4,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   32,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   64,   128,   8,   32,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   64,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,  128,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,  128,   128,   8,   32,  32,   32,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    16,  256,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    32,  256,   128,   8,   32,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+
+        // Memory friendly v4
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   32,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    64,   16,   128,   8,   16,  16,   16,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    1,   128,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   32,   128,   8,   32,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,   64,   128,   8,   32,  16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,   64,   128,   8,   32,  32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    16,  128,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    1,   128,    32,  128,   128,   8,   32,  32,   32,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    16,  256,   128,   8,   32,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,    32,  256,   128,   8,   32,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v4, half_t, half_t, false, false>,
+
+        //new Compute friendly kernel
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,   128,   64,   8,   32,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   128,   128,   64,   8,   32,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<2, 128, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>,
+
+        //new Memory friendly kernel
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,   F16,   F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    1,   128,   16,    64,   256,   8,   32,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             32,             32,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v3, half_t, half_t, false, false>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 000000000..18788a2a1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmV2BScale<Row,
+                                                   Col,
+                                                   Row,
+                                                   F16,
+                                                   I4,
+                                                   F16,
+                                                   F16,
+                                                   1,
+                                                   128,
+                                                   PassThrough,
+                                                   PassThrough,
+                                                   PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances<Intrawave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp
new file mode 100644
index 000000000..d01d48892
--- /dev/null
+++ b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename ADataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename CDataType,
+          index_t ScaleBlockK,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+bool profile_gemm_b_scale_impl(int do_verification,
+                               int init_method,
+                               bool do_log,
+                               bool time_kernel,
+                               int M,
+                               int N,
+                               int K,
+                               int StrideA,
+                               int StrideB,
+                               int StrideC,
+                               int KBatch,
+                               int n_warmup,
+                               int n_iter,
+                               uint64_t rotating = 0)
+{
+    bool pass = true;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    ck::index_t Scale_Stride_BN = ck::is_same_v<BLayout, ck::tensor_layout::gemm::ColumnMajor>
+                                      ? ((K + ScaleBlockK - 1) / ScaleBlockK)
+                                      : N;
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BScaleDataType> b1_k_n(f_host_tensor_descriptor(
+        (K + ScaleBlockK - 1) / ScaleBlockK, // K direction group size is ScaleBlockK
+        N,                                   // N direction group size is 1
+        Scale_Stride_BN,
+        BLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() +
+                            b_k_n.GetElementSpaceSizeInBytes() +
+                            b1_k_n.GetElementSpaceSizeInBytes();
+
+    int rotating_count = std::max(
+        1,
+        std::min(n_iter,
+                 static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "rotating count: " << rotating_count << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+    }
+
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b1_device_buf.ToDevice(b1_k_n.mData.data());
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmV2BScale<ALayout,
+                                                                      BLayout,
+                                                                      CLayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      BScaleDataType,
+                                                                      CDataType,
+                                                                      1,
+                                                                      ScaleBlockK,
+                                                                      AElementOp,
+                                                                      BElementOp,
+                                                                      CElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Run reference GEMM
+    if(do_verification)
+    {
+        Tensor<float> b_k_n_dequant({K, N});
+
+        float v_b = 0;
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
+                int8_t i4        = 0;
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+                i4  = i4 - 8;
+                v_b = ck::type_convert<float>(i4);
+
+                b_k_n_dequant(k, n) = ck::type_convert<float>(v_b) *
+                                      ck::type_convert<float>(b1_k_n(k / ScaleBlockK, n));
+            }
+        }
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                CElementOp,
+                                                                                ComputeDataType>;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n_dequant, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    float best_kbatch     = 0;
+
+    // profile device GEMM instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        const int KPerBlock = op_ptr->GetKPerBlock();
+
+        if(op_ptr->GetPermuteB())
+        {
+            int K1 = KPerBlock;
+            int K0 = K / KPerBlock;
+
+            // int K0, N, K1
+            for(int j = 0; j < K0; j++)
+            {
+                for(int i = 0; i < N; i++)
+                {
+                    for(int jj = 0; jj < K1; jj++)
+                    {
+                        b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                    }
+                }
+            }
+
+            if(is_same_v<BDataType, pk_i4_t> && is_same_v<ADataType, half_t>)
+            {
+                // vector pk_i4x4 permute
+                for(int i = 0; i < N; i++)
+                {
+                    for(int j = 0; j < K; j += 8)
+                    {
+                        int input[8];
+
+                        for(int k = 0; k < 4; k++)
+                        {
+                            int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                            input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                            input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                        }
+
+                        // permute 01234567->20643175
+                        {
+                            int hi   = input[2];
+                            int lo   = input[0];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 0, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[6];
+                            int lo   = input[4];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 2, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[3];
+                            int lo   = input[1];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 4, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[7];
+                            int lo   = input[5];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 6, i) = i4x2;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            b_k_n_permute = b_k_n;
+        }
+
+        b_device_buf.ToDevice(b_k_n_permute.mData.data());
+
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
+
+        if(KBatch > 0)
+        {
+            kbatch_list = {KBatch};
+        }
+
+        for(std::size_t i = 0; i < kbatch_list.size(); i++)
+        {
+            auto kbatch_curr = kbatch_list[i];
+
+            auto argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                M,
+                N,
+                K,
+                StrideA,
+                StrideB,
+                StrideC,
+                Scale_Stride_BN,
+                static_cast<BScaleDataType*>(b1_device_buf.GetDeviceBuffer()),
+                kbatch_curr,
+                a_element_op,
+                b_element_op,
+                c_element_op);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+
+                // re-init C to zero before profiling next kernel
+                c_device_buf.SetZero();
+
+                invoker_ptr->Run(argument_ptr.get(),
+                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+
+                if(do_verification)
+                {
+                    c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+#if defined CK_ENABLE_FP8
+                    // set softer tolerances for fp8
+                    if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
+                                 is_same_v<CDataType, f8_t>)
+                    {
+                        std::string msg = "Error: Incorrect results!";
+                        double rtol     = 1e-1;
+                        double atol     = 1e-1;
+                        pass            = pass & ck::utils::check_err(
+                                          c_m_n_device_result, c_m_n_host_result, msg, rtol, atol);
+                    }
+                    else
+                    {
+#endif
+                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+#if defined CK_ENABLE_FP8
+                    }
+#endif
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                        LogRangeAsType<int8_t>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                            << std::endl;
+                    }
+                }
+
+                std::string op_name = op_ptr->GetTypeString();
+
+                float ave_time = invoker_ptr->Run(argument_ptr.get(),
+                                                  StreamConfig{nullptr,
+                                                               time_kernel,
+                                                               0,
+                                                               n_warmup,
+                                                               n_iter,
+                                                               rotating_count > 1,
+                                                               rotating_count});
+
+                std::size_t flop = std::size_t(2) * M * N * K;
+
+                static constexpr index_t BPackedSize = []() {
+                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                        return 2;
+                    else
+                        return 1;
+                }();
+
+                std::size_t num_btype = sizeof(ADataType) * M * K +
+                                        sizeof(BDataType) * K * N / BPackedSize +
+                                        sizeof(CDataType) * M * N;
+
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
+                          << kbatch_curr << std::endl;
+
+                if(tflops > best_tflops && ave_time > 1e-10)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_kbatch     = kbatch_curr;
+                }
+            }
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
+        }
+    }
+
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index a0978eb6b..61017d4b3 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -58,6 +58,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   endif()
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
diff --git a/profiler/src/profile_gemm_b_scale.cpp b/profiler/src/profile_gemm_b_scale.cpp
new file mode 100644
index 000000000..443ebff83
--- /dev/null
+++ b/profiler/src/profile_gemm_b_scale.cpp
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+
+#include "profiler/profile_gemm_b_scale_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+enum struct GemmDataType
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+    F8_F16_F16,     // 4
+    F16_F8_F16,     // 5
+    F16_F16_F16_F8, // 6
+    F8_F8_BF16,     // 7
+    F16_I4_F16,     // 8
+};
+
+enum struct BScaleBlockTile
+{
+    K_64,  // 0
+    K_128, // 1
+};
+
+#define OP_NAME "gemm_b_scale"
+#define OP_DESC "Int4-dequant GEMM"
+
+int profile_gemm_b_scale(int argc, char* argv[])
+{
+    if(argc != 16 && argc != 19)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
+               "f16->f8; 7: f8->bf16, "
+               "comp f8; 8: f16@i4)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: B scale block tile (0: 64, 1: 128):\n");
+        printf("arg5: verification (0: no; 1: yes)\n");
+        printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg7: print tensor value (0: no; 1: yes)\n");
+        printf("arg8: time kernel (0=no, 1=yes)\n");
+        printf("arg9 to 14: M, N, K, StrideA, StrideB, StrideC\n");
+        printf("arg15: split k into  mulitiple batch\n");
+        printf("optional:\n");
+        printf("arg16: number of warm-up cycles (default 1)\n");
+        printf("arg17: number of iterations (default 10)\n");
+        printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
+        exit(1);
+    }
+
+    printf("Start profiling\n");
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const auto B_scale_block   = static_cast<BScaleBlockTile>(std::stoi(argv[4]));
+    const bool do_verification = std::stoi(argv[5]);
+    const int init_method      = std::stoi(argv[6]);
+    const bool do_log          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[8]);
+
+    const int M = std::stoi(argv[9]);
+    const int N = std::stoi(argv[10]);
+    const int K = std::stoi(argv[11]);
+
+    const int StrideA = std::stoi(argv[12]);
+    const int StrideB = std::stoi(argv[13]);
+    const int StrideC = std::stoi(argv[14]);
+    const int KBatch  = std::stoi(argv[15]);
+    printf("M:%d, N:%d, K:%d, StrideA:%d, StrideB:%d, StrideC:%d, KBatch:%d\n",
+           M,
+           N,
+           K,
+           StrideA,
+           StrideB,
+           StrideC,
+           KBatch);
+
+    int n_warmup      = 1;
+    int n_iter        = 10;
+    uint64_t rotating = 0;
+    if(argc == 19)
+    {
+        n_warmup = std::stoi(argv[16]);
+        n_iter   = std::stoi(argv[17]);
+        rotating = std::stoull(argv[18]) * 1024 * 1024;
+
+        printf("n_warmup:%d, n_iter:%d, rotating:%lu\n", n_warmup, n_iter, rotating);
+    }
+
+    using F32 = float;
+    using F16 = ck::half_t;
+    using I4  = ck::pk_i4_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto b_scale_type,
+                       auto comp_type,
+                       auto acc_type,
+                       auto c_type,
+                       auto scale_block_k,
+                       auto a_layout,
+                       auto b_layout,
+                       auto c_layout) {
+        using ADataType       = decltype(a_type);
+        using BDataType       = decltype(b_type);
+        using BScaleDataType  = decltype(b_scale_type);
+        using ComputeDataType = decltype(comp_type);
+        using AccDataType     = decltype(acc_type);
+        using CDataType       = decltype(c_type);
+
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using CLayout = decltype(c_layout);
+
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+        bool pass = ck::profiler::profile_gemm_b_scale_impl<ADataType,
+                                                            BDataType,
+                                                            BScaleDataType,
+                                                            ComputeDataType,
+                                                            AccDataType,
+                                                            CDataType,
+                                                            scale_block_k,
+                                                            ALayout,
+                                                            BLayout,
+                                                            CLayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideC < 0) ? DefaultStrideC : StrideC,
+            KBatch,
+            n_warmup,
+            n_iter,
+            rotating);
+
+        return pass ? 0 : 1;
+    };
+
+    if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN &&
+       B_scale_block == BScaleBlockTile::K_128)
+    {
+        printf("F16_I4_F16 MK_NK_MN K_128\n");
+        return profile(
+            F16{}, I4{}, F16{}, F16{}, F32{}, F16{}, ck::Number<128>{}, Row{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_b_scale);
-- 
GitLab


From 6df5fe2ad8fb6ff054a3e75250ccef7c878c3455 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Fri, 3 Jan 2025 18:43:07 +0800
Subject: [PATCH 150/153] [CK_TILE]naive attn support FP8 KVCache quant (#1747)

* quant

* fix bug

* simple smoothquant after softmax

* update kv-quant

* update stride

* fix fp8-pertoken-kvcache

* update int8/fp8 quant support

---------

Co-authored-by: so <a.com>
Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
---
 example/ck_tile/01_fmha/fmha_fwd.cpp    |  19 +-
 include/ck_tile/ref/naive_attention.hpp | 422 ++++++++++++++++--------
 2 files changed, 301 insertions(+), 140 deletions(-)

diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 08d263da9..b3855e59d 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1131,15 +1131,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
     {
         // NOTE: use gpu to do validation
         ck_tile::naive_attention_fwd_traits naive_t;
-        naive_t.q_type    = data_type;
-        naive_t.k_type    = data_type;
-        naive_t.v_type    = data_type;
-        naive_t.o_type    = data_type;
-        naive_t.q_layout  = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.k_layout  = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.v_layout  = i_perm == 1 ? "bhsd" : "bshd";
-        naive_t.o_layout  = o_perm == 1 ? "bhsd" : "bshd";
-        naive_t.variation = 0; // TODO?
+        naive_t.q_type     = data_type;
+        naive_t.k_type     = data_type;
+        naive_t.v_type     = data_type;
+        naive_t.o_type     = data_type;
+        naive_t.q_layout   = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.k_layout   = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.v_layout   = i_perm == 1 ? "bhsd" : "bshd";
+        naive_t.o_layout   = o_perm == 1 ? "bhsd" : "bshd";
+        naive_t.variation  = 0; // TODO?
+        naive_t.quant_algo = 0;
 
         ck_tile::DeviceMem o_naive_buf(o_host.get_element_space_size_in_bytes());
 
diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp
index 09ded761e..98ceab699 100644
--- a/include/ck_tile/ref/naive_attention.hpp
+++ b/include/ck_tile/ref/naive_attention.hpp
@@ -13,13 +13,18 @@ namespace ck_tile {
 
 enum class naive_attention_layout_enum
 {
-    BSHD,  // [batch, seqlen, nhead, hdim]
-    BHSD,  // [batch, nhead, seqlen, hdim]
-    BS3HD, // [batch, nhead, 3, seqlen, hdim], used when qkv are packed
-    PHSD,  // [pages, nhead, page_size, hdim]
+    DEFAULT, // maybe this tensor is not used, set some irrelevant value
+    BSHD,    // [batch, seqlen, nhead, hdim]
+    BHSD,    // [batch, nhead, seqlen, hdim]
+    BS3HD,   // [batch, nhead, 3, seqlen, hdim], used when qkv are packed
+    PHSD,    // [pages, nhead, page_size, hdim]
     // PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen
     PHDSX, // [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen
     PHDS,  // [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen
+
+    // scale layout used for dynamic dequant
+    SCALE_HS, // [nhead, tokens] or [nhead, tokens-per-group], nhe KVCache quant
+    SCALE_SH, // [tokens, nhead]
 };
 
 // will used to specialize kernel variation
@@ -30,6 +35,15 @@ enum class naive_attention_variation_enum
     DECODE_PAGED, // decode attn, where kv token from another buffer called kvcache
 };
 
+enum class naive_attention_quant_algo
+{
+    NO              = 0,
+    KV_8BIT_PERHEAD = 1,
+    // FP8/INT8 quant for KVCache, per-token quant
+    // [num_tokens, nhead, hdim] -> [nhead, num_tokens]
+    KV_8BIT_PERTOKEN = 2,
+};
+
 // TODO: for simplicity, this will be used as host/device arg
 struct naive_attention_fwd_args
 {
@@ -40,7 +54,8 @@ struct naive_attention_fwd_args
     void* context_len_ptr; // [batch] used when seqlen kv come from a pointer(each element is a
                            // number, not cumsum)
     void* page_table_ptr;  // [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn)
-    void* kvscale_ptr;     // [nhead, 2(kv), hdim] used for kvcache dequant
+    void* kscale_ptr;      // [nhead, max_kv_tokens] used for kvcache dequant
+    void* vscale_ptr;      // [nhead, max_kv_tokens] used for kvcache dequant
     float scale_s;
     int hdim;
     int hdim_v; // could be cross-attn, where V and Q/K hdim are different
@@ -54,6 +69,7 @@ struct naive_attention_fwd_args
     int nhead_ratio_kv; // nhead_q / nhead_kv
     int page_size;      // if paged, the seqlen-kv per each block
     int max_pages_per_seq;
+    int max_kv_tokens; // used as stride to access kv scale ptr
 };
 
 // this is trait for host API
@@ -67,14 +83,16 @@ struct naive_attention_fwd_traits
     std::string k_layout;
     std::string v_layout;
     std::string o_layout;
-    int variation; // sync with naive_attention_variation_enum
+    int variation;  // sync with naive_attention_variation_enum
+    int quant_algo; // sync with naive_attention_quant_algo
 };
 
 // this is trait for kernel template
-template <naive_attention_variation_enum variation_>
+template <naive_attention_variation_enum variation_, naive_attention_quant_algo quant_algo_>
 struct naive_attention_fwd_kernel_traits
 {
     static constexpr naive_attention_variation_enum variation = variation_;
+    static constexpr naive_attention_quant_algo quant_algo    = quant_algo_;
 };
 
 // for simplicity, please do not use const-reference type for the template type
@@ -83,28 +101,39 @@ template <typename QType,
           typename VType,
           typename OType,
           typename AccType,
+          typename KVScaleType,
           naive_attention_layout_enum QLayout,
           naive_attention_layout_enum KLayout,
           naive_attention_layout_enum VLayout,
           naive_attention_layout_enum OLayout,
+          naive_attention_layout_enum KScaleLayout,
+          naive_attention_layout_enum VScaleLayout,
           typename Traits>
 struct naive_attention_fwd_kernel
 {
     static constexpr bool is_kvcache_i8 =
-        std::is_same_v<KType, int8_t> && std::is_same_v<VType, int8_t> && sizeof(QType) != 1;
+        std::is_same_v<KType, int8_t> && std::is_same_v<VType, int8_t>;
+    static constexpr bool is_kvcache_fp8 =
+        std::is_same_v<KType, fp8_t> && std::is_same_v<VType, fp8_t>;
 
-    // kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original
-    // K/V matrix. This can speed up conversion since Q/P usually is fp16/bf16/fp32
-    static constexpr bool is_kvcache_i8_forward_quant = is_kvcache_i8;
+    static constexpr int v_per_token_quant_group_size = 64;
 
     // TODO: hardcode
-    using KVScaleType = float;
-    using SoftmaxType = float;
-    using PType       = VType; // src A of gemm2, same type as V
+    using SoftmaxType      = float; // always using float to do softmax compute
+    using QuantComputeType = float; // used for quant/dequant scale compute
+    using QCompute         = KType; // src A of gemm1, same type as K
+    using PType            = VType; // src A of gemm2, same type as V
+    using OAccType         = float; // always float, in case int8 FA
 
     using p_vec_type                = ext_vector_t<PType, 16 / sizeof(PType)>;
     static constexpr int p_vec_elem = vector_traits<p_vec_type>::vector_size;
 
+    // clang-format off
+    template <typename T_> struct scale_max { static constexpr float value = 1; /* dummy code */ };
+    template <> struct scale_max<int8_t> { static constexpr float value = 127.0; };
+    template <> struct scale_max<fp8_t> { static constexpr float value = 240.0; };
+    // clang-format on
+
     __host__ __device__ naive_attention_fwd_kernel() {}
 
     template <typename T, naive_attention_layout_enum Layout>
@@ -198,24 +227,31 @@ struct naive_attention_fwd_kernel
         __device__ void store(T /*value*/, int /*i_s*/, int /*i_d*/) {}
     };
 
-    template <typename T>
+    template <typename T, naive_attention_layout_enum Layout>
     struct kvscale_addresser
     {
-        int h, d; // nhead, hdim
+        int s, h, d; // seqlen(tokens), nhead, hdim
         T* base_ptr;
-        __device__ kvscale_addresser(int h_, int d_, void* p_)
-            : h(h_), d(d_), base_ptr(reinterpret_cast<T*>(p_))
+        __device__ kvscale_addresser(int s_, int h_, int d_, void* p_)
+            : s(s_), h(h_), d(d_), base_ptr(reinterpret_cast<T*>(p_))
         {
         }
-        __device__ int get_offset(int i_h, int i_d, int i_kv /*0 or 1*/)
+        __device__ int get_offset(int i_s, int i_h, int i_d)
         {
+            if constexpr(Layout == naive_attention_layout_enum::SCALE_HS)
+            {
+                // [nhead, tokens]
+                (void)i_d;
+                return i_h * s + i_s;
+            }
+            else if constexpr(Layout == naive_attention_layout_enum::DEFAULT)
+            {
+                return 0;
+            }
             // [h, 2, d]
-            return i_h * 2 * d + i_kv * d + i_d;
-        }
-        __device__ T load(int i_h, int i_d, int i_kv)
-        {
-            return base_ptr[get_offset(i_h, i_d, i_kv)];
+            // return i_h * 2 * d + i_kv * d + i_d;
         }
+        __device__ T load(int i_s, int i_h, int i_d) { return base_ptr[get_offset(i_s, i_h, i_d)]; }
     };
 
     __device__ __host__ static constexpr int get_block_size() { return 256; }
@@ -282,12 +318,13 @@ struct naive_attention_fwd_kernel
     __device__ void operator()(naive_attention_fwd_args args)
     {
         constexpr int wg_size = get_block_size();
-        __shared__ char smem[wg_size * 4 * sizeof(float)]; //  should enough
-        int i_dv    = blockIdx.x * wg_size + threadIdx.x;  // index of hdim_v
-        int i_sq    = blockIdx.y;                          // index of seqlen_q
-        int i_batch = blockIdx.z;                          // index of batch_q * nhead_q
-        int i_bq    = i_batch / args.nhead_q;              // index of batch_q
-        int i_hq    = i_batch % args.nhead_q;              // index of nhead_q
+        __shared__ char smem[wg_size * 4 * sizeof(float)];       //  should enough
+        char* smem_quant_q = smem + wg_size * 2 * sizeof(float); // second half, should enough
+        int i_dv           = blockIdx.x * wg_size + threadIdx.x; // index of hdim_v
+        int i_sq           = blockIdx.y;                         // index of seqlen_q
+        int i_batch        = blockIdx.z;                         // index of batch_q * nhead_q
+        int i_bq           = i_batch / args.nhead_q;             // index of batch_q
+        int i_hq           = i_batch % args.nhead_q;             // index of nhead_q
 
         int i_bk = i_bq / args.batch_ratio_kv;
         int i_hk = i_hq / args.nhead_ratio_kv;
@@ -360,9 +397,10 @@ struct naive_attention_fwd_kernel
         auto f_max        = [](auto x_, auto y_) { return max(x_, y_); };
         auto f_sum        = [](auto x_, auto y_) { return x_ + y_; };
         auto f_absmax_f32 = [](float v_0_, float v_1_) {
-            float rtn;
-            asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
-            return rtn;
+            // float rtn;
+            // asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
+            // return rtn;
+            return max(abs(v_0_), abs(v_1_));
         };
 
         int seqlen_kv = [&]() {
@@ -378,45 +416,82 @@ struct naive_attention_fwd_kernel
 
         SoftmaxType row_max = -numeric<SoftmaxType>::infinity();
         SoftmaxType l{0};
-        AccType o_acc = {0};
+        // AccType o_acc = {0};
+        OAccType o_acc = {0};
 
-        int sk_loops   = (seqlen_kv + wg_size - 1) / wg_size;
-        float qf_scale = .0f;
-        kvscale_addresser<KVScaleType> kvscale_addr{args.nhead_kv, args.hdim, args.kvscale_ptr};
+        int sk_loops                     = (seqlen_kv + wg_size - 1) / wg_size;
+        QuantComputeType q_dequant_scale = .0f;
+        kvscale_addresser<KVScaleType, KScaleLayout> kscale_addr{
+            args.max_kv_tokens, args.nhead_kv, args.hdim, args.kscale_ptr};
+        kvscale_addresser<KVScaleType, VScaleLayout> vscale_addr{
+            args.max_kv_tokens, args.nhead_kv, args.hdim_v, args.vscale_ptr};
 
-        if constexpr(is_kvcache_i8_forward_quant)
+        if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD)
         {
             // AccType is i32 now, seqlen_q = 1, hdim up to 256
-            float q   = 0;
-            float k_s = 0;
+            AccType q   = 0;
+            AccType k_s = 0;
             if(static_cast<int>(threadIdx.x) < args.hdim)
             {
-                q   = type_convert<float>(q_addr.load(0, threadIdx.x));
-                k_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 0));
+                q   = type_convert<AccType>(q_addr.load(0, threadIdx.x));
+                k_s = type_convert<AccType>(kscale_addr.load(i_hk, threadIdx.x, 0));
             }
             // 1) we apply the k scale to q
-            float q_forwarded = q * k_s;
+            AccType q_forwarded = q * k_s;
 
             // 2) apply smooth-quant
             // find absmax
-            float qf_max = wave_reduce(q_forwarded, f_absmax_f32);
-            qf_max       = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+            AccType qf_max = wave_reduce(q_forwarded, f_absmax_f32);
+            qf_max = cross_wave_reduce(qf_max, f_absmax_f32, reinterpret_cast<AccType*>(smem));
 
             // per-token scale
-            qf_scale = qf_max / 127.0;
+            q_dequant_scale = type_convert<QuantComputeType>(qf_max) / scale_max<QCompute>::value;
 
             // devide by scale
-            q = q / qf_scale;
+            q = q / q_dequant_scale;
 
             // fp32->i8
-            int8_t quantized_q = static_cast<int8_t>(q);
+            QCompute quantized_q = static_cast<QCompute>(q);
             __syncthreads();
-            reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_q;
+            reinterpret_cast<QCompute*>(smem)[threadIdx.x] = quantized_q;
             __syncthreads();
 
             // after above process, we have 2 data
             // 1) int8 q data stored in smem(no need to reload)
-            // 2) per-token scale qf_scale, to be mul after 1st gemm
+            // 2) per-token scale q_dequant_scale, to be mul after 1st gemm
+        }
+        else if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERTOKEN)
+        {
+            if(std::is_same_v<QType, fp16_t> || std::is_same_v<QType, bf16_t>)
+            {
+                // dyanmic quant q here
+                float q = 0;
+                if(static_cast<int>(threadIdx.x) < args.hdim)
+                {
+                    q = type_convert<float>(q_addr.load(i_sq, threadIdx.x));
+                }
+
+                // apply smooth-quant
+                // find absmax
+                float q_max = wave_reduce(q, f_absmax_f32);
+                q_max = cross_wave_reduce(q_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+
+                // per-token scale
+                q_dequant_scale =
+                    type_convert<QuantComputeType>(q_max) / scale_max<QCompute>::value;
+
+                // devide by scale
+                q = q / q_dequant_scale;
+
+                QCompute quantized_q = type_convert<QCompute>(q);
+                __syncthreads();
+                reinterpret_cast<QCompute*>(smem_quant_q)[threadIdx.x] = quantized_q;
+                __syncthreads();
+
+                // after above process, we have 2 data
+                // 1) fp8 q data stored in smem(no need to reload from global)
+                // 2) per-token scale q_dequant_scale, to be mul after 1st gemm
+            }
         }
 
         for(int i_loop1 = 0; i_loop1 < sk_loops; i_loop1++)
@@ -429,33 +504,41 @@ struct naive_attention_fwd_kernel
                 AccType s_acc{0}; // clear for every loop
                 for(auto i_dq = 0; i_dq < args.hdim; i_dq++)
                 {
-                    if constexpr(is_kvcache_i8_forward_quant)
-                    {
-                        int8_t q = reinterpret_cast<int8_t*>(smem)[i_dq];
-                        auto k   = k_addr.load(i_sk, i_dq);
-
-                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
-                    }
-                    else
-                    {
-                        auto q = q_addr.load(i_sq, i_dq); // q will have duplicate load
-                        auto k = k_addr.load(i_sk, i_dq);
+                    auto q = [&]() {
+                        if constexpr(Traits::quant_algo ==
+                                         naive_attention_quant_algo::KV_8BIT_PERHEAD ||
+                                     Traits::quant_algo ==
+                                         naive_attention_quant_algo::KV_8BIT_PERTOKEN)
+                        {
+                            return reinterpret_cast<QCompute*>(smem_quant_q)[i_dq];
+                        }
+                        else
+                            return q_addr.load(i_sq, i_dq); // q will have duplicate load
+                    }();
+                    auto k = [&]() { return k_addr.load(i_sk, i_dq); }();
 
-                        s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
-                    }
+                    s_acc += type_convert<AccType>(q) * type_convert<AccType>(k);
                 }
                 // scale
                 s_softmax = type_convert<SoftmaxType>(s_acc);
                 s_softmax *=
                     type_convert<SoftmaxType>(args.scale_s * ck_tile::log2e_v<SoftmaxType>);
-                if constexpr(is_kvcache_i8_forward_quant)
+                if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD)
+                {
+                    s_softmax *= q_dequant_scale; // post scale the per-token factor
+                }
+                else if constexpr(Traits::quant_algo ==
+                                  naive_attention_quant_algo::KV_8BIT_PERTOKEN)
                 {
-                    s_softmax *= qf_scale; // post scale the per-token factor
+                    SoftmaxType k_per_token_scale =
+                        type_convert<SoftmaxType>(kscale_addr.load(i_sk, i_hk, 0));
+                    s_softmax *= q_dequant_scale;
+                    s_softmax *= k_per_token_scale;
                 }
             }
 
             // s->p
-            float pf_scale = 0.; // used for i8 quant
+            QuantComputeType p_dequant_scale = 1.;
             {
                 // softmax, find max
                 SoftmaxType old_max = row_max;
@@ -473,41 +556,69 @@ struct naive_attention_fwd_kernel
                 // l, pre-scall o_acc
                 SoftmaxType tmp = __builtin_amdgcn_exp2f(old_max - row_max);
                 l               = tmp * l + row_sum;
-                o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+                o_acc           = type_convert<OAccType>(type_convert<SoftmaxType>(o_acc) * tmp);
 
                 // prepare the p_compute into smem, to let every thread read same p_compute and do
                 // 2nd gemm
-                if constexpr(is_kvcache_i8_forward_quant)
+                if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD)
                 {
-                    float v_s = 0;
+                    QuantComputeType v_s = 0;
                     if(static_cast<int>(threadIdx.x) < args.hdim_v)
                     {
-                        v_s = type_convert<float>(kvscale_addr.load(i_hk, threadIdx.x, 1));
+                        v_s =
+                            type_convert<QuantComputeType>(vscale_addr.load(i_hk, threadIdx.x, 1));
                     }
 
                     // 1) we apply the v scale to p
-                    float p_forwarded = p_compute * v_s;
+                    QuantComputeType p_forwarded = p_compute * v_s;
 
                     // 2) apply smooth-quant
                     // find absmax
-                    float pf_max = wave_reduce(p_forwarded, f_absmax_f32);
-                    pf_max =
-                        cross_wave_reduce(pf_max, f_absmax_f32, reinterpret_cast<float*>(smem));
+                    QuantComputeType pf_max = wave_reduce(p_forwarded, f_absmax_f32);
+                    pf_max                  = cross_wave_reduce(
+                        pf_max, f_absmax_f32, reinterpret_cast<QuantComputeType*>(smem));
 
                     // per-token scale
-                    pf_scale = pf_max / 127.0;
+                    p_dequant_scale = pf_max / scale_max<PType>::value; // 127.0;
 
                     // devide by scale
-                    p_compute = p_compute / pf_scale;
+                    p_compute = p_compute / p_dequant_scale;
 
                     // fp32->i8
-                    int8_t quantized_p = static_cast<int8_t>(p_compute);
+                    PType quantized_p = static_cast<PType>(p_compute);
                     __syncthreads();
-                    reinterpret_cast<int8_t*>(smem)[threadIdx.x] = quantized_p;
+                    reinterpret_cast<PType*>(smem)[threadIdx.x] = quantized_p;
                     __syncthreads();
                     // after above process, we have 2 data
                     // 1) int8 p data stored in smem(no need to reload)
-                    // 2) per-token scale pf_scale, to be mul after 2nd gemm
+                    // 2) per-token scale p_dequant_scale, to be mul after 2nd gemm
+                }
+                else if constexpr(Traits::quant_algo ==
+                                  naive_attention_quant_algo::KV_8BIT_PERTOKEN)
+                {
+                    // forward apply the v scale to p_compute, this is compute friendly
+                    auto v_scale = type_convert<QuantComputeType>(vscale_addr.load(i_sk, i_hk, 0));
+                    p_compute *= v_scale;
+                    // smooth-quant
+                    // find absmax
+                    QuantComputeType p_max = wave_reduce(p_compute, f_absmax_f32);
+                    p_max                  = cross_wave_reduce(
+                        p_max, f_absmax_f32, reinterpret_cast<QuantComputeType*>(smem));
+
+                    // per-token scale
+                    p_dequant_scale = p_max / scale_max<PType>::value; // 240.0;
+
+                    // devide by scale
+                    p_compute = p_compute / p_dequant_scale;
+
+                    // fp32->i8
+                    PType quantized_p = type_convert<PType>(p_compute);
+                    __syncthreads();
+                    reinterpret_cast<PType*>(smem)[threadIdx.x] = quantized_p;
+                    __syncthreads();
+                    // after above process, we have 2 data
+                    // 1) fp8_t p data stored in smem(no need to reload)
+                    // 2) per-token scale p_dequant_scale, to be mul after 2nd gemm
                 }
                 else
                 {
@@ -531,29 +642,45 @@ struct naive_attention_fwd_kernel
                         int sv_offset = i_loop2 * p_vec_elem + i_j;
                         int i_sv      = sk_start + sv_offset;
 
-                        VType v = 0.f;
+                        VType v = 0;
                         if(i_dv < args.hdim_v && i_sv < seqlen_kv)
                         {
                             v = v_addr.load(i_sv, i_dv);
                         }
 
-                        o_acc_local += type_convert<AccType>(p_vec[i_j]) * type_convert<AccType>(v);
+                        AccType v_compute = [&]() { return type_convert<AccType>(v); }();
+
+                        o_acc_local += type_convert<AccType>(p_vec[i_j]) * v_compute;
                     }
                 }
-                if constexpr(is_kvcache_i8_forward_quant)
-                {
-                    // apply pr scale to local acc
-                    o_acc_local =
-                        type_convert<AccType>(type_convert<float>(o_acc_local) * pf_scale);
-                }
-                o_acc += o_acc_local;
+
+                OAccType post_scale_o_acc_local = [&]() {
+                    if constexpr(Traits::quant_algo == naive_attention_quant_algo::KV_8BIT_PERHEAD)
+                    {
+                        // apply pr scale to local acc
+                        return type_convert<OAccType>(type_convert<QuantComputeType>(o_acc_local) *
+                                                      p_dequant_scale);
+                    }
+                    else if constexpr(Traits::quant_algo ==
+                                      naive_attention_quant_algo::KV_8BIT_PERTOKEN)
+                    {
+                        // apply pr scale to local acc
+                        return type_convert<OAccType>(type_convert<QuantComputeType>(o_acc_local) *
+                                                      p_dequant_scale);
+                    }
+                    else
+                    {
+                        return type_convert<OAccType>(o_acc_local);
+                    }
+                }();
+                o_acc += post_scale_o_acc_local;
             }
         }
 
         // post scale o_acc
         {
             SoftmaxType tmp = l == 0.f ? 0.f : 1.f / l; // in case masking
-            o_acc           = type_convert<AccType>(type_convert<SoftmaxType>(o_acc) * tmp);
+            o_acc           = type_convert<OAccType>(type_convert<SoftmaxType>(o_acc) * tmp);
         }
 
         // store O
@@ -564,18 +691,21 @@ struct naive_attention_fwd_kernel
 
 #define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_()                                                        \
     {                                                                                                       \
-        using ktraits_ =                                                                                    \
-            naive_attention_fwd_kernel_traits<static_cast<naive_attention_variation_enum>(                  \
-                variation_)>;                                                                               \
+        using ktraits_ = naive_attention_fwd_kernel_traits<                                                 \
+            static_cast<naive_attention_variation_enum>(variation_),                                        \
+            static_cast<naive_attention_quant_algo>(quant_algo_)>;                                          \
         using k_   = naive_attention_fwd_kernel<q_type_,                                                    \
                                               k_type_,                                                    \
                                               v_type_,                                                    \
                                               o_type_,                                                    \
                                               acc_type_,                                                  \
+                                              kvscale_type_,                                              \
                                               q_layout_,                                                  \
                                               k_layout_,                                                  \
                                               v_layout_,                                                  \
                                               o_layout_,                                                  \
+                                              k_scale_layout_,                                            \
+                                              v_scale_layout_,                                            \
                                               ktraits_>;                                                  \
         dim3 grids = k_::get_grid_size(a);                                                                  \
         r          = ck_tile::launch_kernel(s,                                                              \
@@ -586,31 +716,37 @@ struct naive_attention_fwd_kernel
     if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \
        t.o_layout == "bshd")                                                                       \
     {                                                                                              \
-        constexpr auto q_layout_ = naive_attention_layout_enum::BSHD;                              \
-        constexpr auto k_layout_ = naive_attention_layout_enum::BSHD;                              \
-        constexpr auto v_layout_ = naive_attention_layout_enum::BSHD;                              \
-        constexpr auto o_layout_ = naive_attention_layout_enum::BSHD;                              \
-        constexpr int variation_ = 0;                                                              \
+        constexpr auto q_layout_       = naive_attention_layout_enum::BSHD;                        \
+        constexpr auto k_layout_       = naive_attention_layout_enum::BSHD;                        \
+        constexpr auto v_layout_       = naive_attention_layout_enum::BSHD;                        \
+        constexpr auto o_layout_       = naive_attention_layout_enum::BSHD;                        \
+        constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT;                     \
+        constexpr auto v_scale_layout_ = naive_attention_layout_enum::DEFAULT;                     \
+        constexpr int variation_       = 0;                                                        \
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
     }                                                                                              \
     else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" &&                    \
             t.v_layout == "bhsd" && t.o_layout == "bhsd")                                          \
     {                                                                                              \
-        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr auto k_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr auto v_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr int variation_ = 0;                                                              \
+        constexpr auto q_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto k_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto v_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto o_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT;                     \
+        constexpr auto v_scale_layout_ = naive_attention_layout_enum::DEFAULT;                     \
+        constexpr int variation_       = 0;                                                        \
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
     }                                                                                              \
     else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" &&                   \
             t.v_layout == "phds" && t.o_layout == "bhsd")                                          \
     {                                                                                              \
-        constexpr auto q_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX;                             \
-        constexpr auto v_layout_ = naive_attention_layout_enum::PHDS;                              \
-        constexpr auto o_layout_ = naive_attention_layout_enum::BHSD;                              \
-        constexpr int variation_ = 2;                                                              \
+        constexpr auto q_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto k_layout_       = naive_attention_layout_enum::PHDSX;                       \
+        constexpr auto v_layout_       = naive_attention_layout_enum::PHDS;                        \
+        constexpr auto o_layout_       = naive_attention_layout_enum::BHSD;                        \
+        constexpr auto k_scale_layout_ = naive_attention_layout_enum::SCALE_HS;                    \
+        constexpr auto v_scale_layout_ = naive_attention_layout_enum::SCALE_HS;                    \
+        constexpr int variation_       = 2;                                                        \
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_();                                              \
     }
 
@@ -621,40 +757,64 @@ CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t,
 {
     float r = -1;
     // TODO: do not explicitly create too much instance!
-    if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16")
+    if(t.q_type == "fp16" && t.k_type == "fp16" && t.v_type == "fp16" && t.o_type == "fp16" &&
+       t.quant_algo == 0)
+    {
+        using q_type_             = fp16_t;
+        using k_type_             = fp16_t;
+        using v_type_             = fp16_t;
+        using o_type_             = fp16_t;
+        using acc_type_           = float;
+        using kvscale_type_       = float;
+        constexpr int quant_algo_ = 0;
+        CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
+    }
+    else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16" &&
+            t.quant_algo == 0)
     {
-        using q_type_   = fp16_t;
-        using k_type_   = fp16_t;
-        using v_type_   = fp16_t;
-        using o_type_   = fp16_t;
-        using acc_type_ = float;
+        using q_type_             = bf16_t;
+        using k_type_             = bf16_t;
+        using v_type_             = bf16_t;
+        using o_type_             = bf16_t;
+        using acc_type_           = float;
+        using kvscale_type_       = float;
+        constexpr int quant_algo_ = 0;
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
     }
-    else if(t.q_type == "bf16" && t.k_type == "bf16" && t.v_type == "bf16" && t.o_type == "bf16")
+    else if(t.q_type == "bf16" && t.k_type == "fp8" && t.v_type == "fp8" && t.o_type == "bf16" &&
+            t.quant_algo == 2)
     {
-        using q_type_   = bf16_t;
-        using k_type_   = bf16_t;
-        using v_type_   = bf16_t;
-        using o_type_   = bf16_t;
-        using acc_type_ = float;
+        using q_type_             = bf16_t;
+        using k_type_             = fp8_t;
+        using v_type_             = fp8_t;
+        using o_type_             = bf16_t;
+        using acc_type_           = float; // NOTE!
+        using kvscale_type_       = float;
+        constexpr int quant_algo_ = 2;
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
     }
-    else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16")
+    else if(t.q_type == "fp16" && t.k_type == "fp8" && t.v_type == "fp8" && t.o_type == "fp16" &&
+            t.quant_algo == 2)
     {
-        using q_type_   = bf16_t;
-        using k_type_   = int8_t;
-        using v_type_   = int8_t;
-        using o_type_   = bf16_t;
-        using acc_type_ = int32_t; // NOTE!
+        using q_type_             = fp16_t;
+        using k_type_             = fp8_t;
+        using v_type_             = fp8_t;
+        using o_type_             = fp16_t;
+        using acc_type_           = float; // NOTE!
+        using kvscale_type_       = float;
+        constexpr int quant_algo_ = 2;
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
     }
-    else if(t.q_type == "fp16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "fp16")
+    else if(t.q_type == "bf16" && t.k_type == "int8" && t.v_type == "int8" && t.o_type == "bf16" &&
+            t.quant_algo == 2)
     {
-        using q_type_   = fp16_t;
-        using k_type_   = int8_t;
-        using v_type_   = int8_t;
-        using o_type_   = fp16_t;
-        using acc_type_ = int32_t; // NOTE!
+        using q_type_             = bf16_t;
+        using k_type_             = int8_t;
+        using v_type_             = int8_t;
+        using o_type_             = bf16_t;
+        using acc_type_           = int32_t; // NOTE!
+        using kvscale_type_       = float;
+        constexpr int quant_algo_ = 2;
         CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_();
     }
     return r;
-- 
GitLab


From 8ea375bb58243b943918d3673434fd13a59d5a01 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 3 Jan 2025 16:38:22 -0800
Subject: [PATCH 151/153] terminology clean-up (#1792)

---
 .../gpu/thread/threadwise_tensor_slice_transfer.hpp           | 4 ++--
 include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 8c65ef32a..bb1871ae6 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1544,7 +1544,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
     ElementwiseOperation element_op_;
 };
 
-// Specilized for WMMA-Navi3
+// Specialized for gfx11
 // A single Wave32 is composed by double row
 // Data exchange allowed between these two rows
 // This RowLane Dst buf will be filled from two Src buf
@@ -1679,7 +1679,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
     ElementwiseOperation element_op_{};
 };
 
-// Specilized for WMMA-Navi4
+// Specialized for gfx12
 template <typename SrcData,
           typename DstData,
           typename SrcDesc,
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index b435a2a12..1abae56be 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -307,7 +307,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
 
     // Wave mode dependent propety
     static constexpr index_t wave_size = Number<WaveSize>{};
-    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
+    // * Fixed for gfx11, Will be wave mode dependent on gfx12
     // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
     // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
     // * num_acc_vgprs_per_wave alone M direction
-- 
GitLab


From 37b35146482a69189928320ea06a77f3e3109c9e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 3 Jan 2025 17:47:48 -0800
Subject: [PATCH 152/153] Bump rocm-docs-core from 1.12.0 to 1.12.1 in
 /docs/sphinx (#1788)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.12.0 to 1.12.1.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.12.0...v1.12.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 46a61a87f..2c7961c37 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.12.0
+rocm-docs-core==1.12.1
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index c2e74baae..3b84d1477 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.12.0
+rocm-docs-core==1.12.1
     # via -r requirements.in
 six==1.16.0
     # via pybtex
-- 
GitLab


From 888317e698e9803c62bd38568abc9e05d7709f33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Sat, 4 Jan 2025 14:01:33 +0100
Subject: [PATCH 153/153] Fix universal gemm profiler for pk_i4_t (#1790)

* Fix universal gemm profiler for pk_i4_t

* fix
---
 include/ck/library/utility/host_tensor.hpp        | 13 +++++++++++--
 include/ck/utility/type_convert.hpp               | 15 ++++++++++++++-
 .../profiler/profile_gemm_universal_impl.hpp      |  6 +++---
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
index ef5738be0..f1730de0e 100644
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -44,10 +44,19 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
         else
             os << delim;
 
-        if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck::bf8_t>)
+        using RangeType = ck::remove_cvref_t<decltype(v)>;
+        if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
+                     std::is_same_v<RangeType, ck::bhalf_t>)
         {
             os << ck::type_convert<float>(v);
         }
+        else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t>)
+        {
+            const auto packed_floats = ck::type_convert<ck::float2_t>(v);
+            const ck::vector_type<float, 2> vector_of_floats{packed_floats};
+            os << vector_of_floats.template AsType<float>()[ck::Number<0>{}] << delim
+               << vector_of_floats.template AsType<float>()[ck::Number<1>{}];
+        }
         else
         {
             os << static_cast<T>(v);
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index f372756e6..9120ce62c 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -465,6 +465,19 @@ inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_
 #endif
 }
 
+template <>
+inline __host__ __device__ float2_t type_convert<float2_t, pk_i4_t>(pk_i4_t x)
+{
+    uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
+    uint8_t x_l  = (x_u8 & 0x0f) >> 0;
+    uint8_t x_h  = (x_u8 & 0xf0) >> 4;
+
+    auto l_f32 = ck::type_convert<float>(x_l);
+    auto h_f32 = ck::type_convert<float>(x_h);
+
+    return {l_f32, h_f32};
+}
+
 template <>
 inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 {
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index ed7e86ded..2054ffbbb 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -177,7 +177,7 @@ bool profile_gemm_universal_impl(int do_verification,
                 }
             }
 
-            if(is_same_v<BDataType, pk_i4_t> && is_same_v<ADataType, half_t>)
+            if constexpr(is_same_v<BDataType, pk_i4_t> && is_same_v<ADataType, half_t>)
             {
                 // vector pk_i4x4 permute
                 for(int i = 0; i < N; i++)
@@ -188,7 +188,7 @@ bool profile_gemm_universal_impl(int do_verification,
 
                         for(int k = 0; k < 4; k++)
                         {
-                            int i4x2         = b_k_n_permute(j + k * 2, i);
+                            int i4x2         = b_k_n_permute(j + k * 2, i).data;
                             input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
                             input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
                         }
-- 
GitLab